Skip to content

Commit

Permalink
[LV] Fix handling of interleaving linear args (llvm#78725)
Browse files Browse the repository at this point in the history
Currently when interleaving vector calls with linear arguments,
the Part is ignored and all vector calls use the initial value
from the first lane of the current iteration.

Fix this to extract from the correct part of the linear vector.

(cherry picked from commit d4c0171)
  • Loading branch information
huntergr-arm authored and tstellar committed Feb 7, 2024
1 parent b82aea1 commit 6e9673f
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 24 deletions.
12 changes: 7 additions & 5 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -597,13 +597,15 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
for (const auto &I : enumerate(operands())) {
// Some intrinsics have a scalar argument - don't replace it with a
// vector.
// Some vectorized function variants may also take a scalar argument,
// e.g. linear parameters for pointers.
Value *Arg;
if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) ||
(UseIntrinsic &&
isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())))
if (UseIntrinsic &&
isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
Arg = State.get(I.value(), VPIteration(0, 0));
// Some vectorized function variants may also take a scalar argument,
// e.g. linear parameters for pointers. This needs to be the scalar value
// from the start of the respective part when interleaving.
else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
Arg = State.get(I.value(), VPIteration(Part, 0));
else
Arg = State.get(I.value(), Part);
if (UseIntrinsic &&
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ define void @test_linear8(ptr noalias %a, ptr readnone %b, i64 %n) {
; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) {
; NEON_INTERLEAVE: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP2:%.*]], i32 0
; NEON_INTERLEAVE: [[TMP5:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP4]])
; NEON_INTERLEAVE: [[TMP6:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP4]])
; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP3:%.*]], i32 0
; NEON_INTERLEAVE: [[TMP7:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP6]])
; NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR0:[0-9]+]]
;
; SVE_OR_NEON-LABEL: define void @test_linear8
Expand All @@ -34,8 +35,9 @@ define void @test_linear8(ptr noalias %a, ptr readnone %b, i64 %n) {
; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; SVE_OR_NEON_INTERLEAVE: [[TMP33:%.*]] = extractelement <vscale x 2 x ptr> [[TMP31:%.*]], i32 0
; SVE_OR_NEON_INTERLEAVE: [[TMP34:%.*]] = call <vscale x 2 x i64> @vec_foo_linear8_mask_sve(ptr [[TMP33]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
; SVE_OR_NEON_INTERLEAVE: [[TMP35:%.*]] = call <vscale x 2 x i64> @vec_foo_linear8_mask_sve(ptr [[TMP33]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2:%.*]])
; SVE_OR_NEON_INTERLEAVE: [[TMP47:%.*]] = extractelement <vscale x 2 x i1> [[TMP45:%.*]], i32 0
; SVE_OR_NEON_INTERLEAVE: [[TMP35:%.*]] = extractelement <vscale x 2 x ptr> [[TMP32:%.*]], i32 0
; SVE_OR_NEON_INTERLEAVE: [[TMP36:%.*]] = call <vscale x 2 x i64> @vec_foo_linear8_mask_sve(ptr [[TMP35]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2:%.*]])
; SVE_OR_NEON_INTERLEAVE: [[TMP48:%.*]] = extractelement <vscale x 2 x i1> [[TMP46:%.*]], i32 0
; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]]
;
; SVE_TF-LABEL: define void @test_linear8
Expand All @@ -49,8 +51,9 @@ define void @test_linear8(ptr noalias %a, ptr readnone %b, i64 %n) {
; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; SVE_TF_INTERLEAVE: [[TMP33:%.*]] = extractelement <vscale x 2 x ptr> [[TMP31:%.*]], i32 0
; SVE_TF_INTERLEAVE: [[TMP34:%.*]] = call <vscale x 2 x i64> @vec_foo_linear8_mask_sve(ptr [[TMP33]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
; SVE_TF_INTERLEAVE: [[TMP35:%.*]] = call <vscale x 2 x i64> @vec_foo_linear8_mask_sve(ptr [[TMP33]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2:%.*]])
; SVE_TF_INTERLEAVE: [[TMP47:%.*]] = extractelement <vscale x 2 x i1> [[TMP45:%.*]], i32 0
; SVE_TF_INTERLEAVE: [[TMP35:%.*]] = extractelement <vscale x 2 x ptr> [[TMP32:%.*]], i32 0
; SVE_TF_INTERLEAVE: [[TMP36:%.*]] = call <vscale x 2 x i64> @vec_foo_linear8_mask_sve(ptr [[TMP35]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2:%.*]])
; SVE_TF_INTERLEAVE: [[TMP48:%.*]] = extractelement <vscale x 2 x i1> [[TMP46:%.*]], i32 0
; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]]
;
entry:
Expand Down Expand Up @@ -81,7 +84,8 @@ define void @test_vector_linear4(ptr noalias %a, ptr readnone %b, ptr readonly %
; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) {
; NEON_INTERLEAVE: [[TMP8:%.*]] = extractelement <4 x ptr> [[TMP6:%.*]], i32 0
; NEON_INTERLEAVE: [[TMP9:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD:%.*]], ptr [[TMP8]])
; NEON_INTERLEAVE: [[TMP10:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD2:%.*]], ptr [[TMP8]])
; NEON_INTERLEAVE: [[TMP10:%.*]] = extractelement <4 x ptr> [[TMP7:%.*]], i32 0
; NEON_INTERLEAVE: [[TMP11:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD2:%.*]], ptr [[TMP10]])
; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR1:[0-9]+]]
;
; SVE_OR_NEON-LABEL: define void @test_vector_linear4
Expand Down Expand Up @@ -176,7 +180,8 @@ define void @test_linear16_wide_stride(ptr noalias %a, ptr readnone %b, i64 %n)
; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) {
; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP4:%.*]], i32 0
; NEON_INTERLEAVE: [[TMP7:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP6]])
; NEON_INTERLEAVE: [[TMP8:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP6]])
; NEON_INTERLEAVE: [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP5:%.*]], i32 0
; NEON_INTERLEAVE: [[TMP9:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP8]])
; NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2]]
;
; SVE_OR_NEON-LABEL: define void @test_linear16_wide_stride
Expand Down Expand Up @@ -228,7 +233,9 @@ define void @test_linear4_linear8(ptr noalias %a, ptr readnone %b, ptr readonly
; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP2:%.*]], i32 0
; NEON_INTERLEAVE: [[TMP7:%.*]] = extractelement <4 x ptr> [[TMP4:%.*]], i32 0
; NEON_INTERLEAVE: [[TMP8:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP6]], ptr [[TMP7]])
; NEON_INTERLEAVE: [[TMP9:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP6]], ptr [[TMP7]])
; NEON_INTERLEAVE: [[TMP9:%.*]] = extractelement <4 x ptr> [[TMP3:%.*]], i32 0
; NEON_INTERLEAVE: [[TMP10:%.*]] = extractelement <4 x ptr> [[TMP5:%.*]], i32 0
; NEON_INTERLEAVE: [[TMP11:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP9]], ptr [[TMP10]])
; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]]
;
; SVE_OR_NEON-LABEL: define void @test_linear4_linear8
Expand All @@ -243,8 +250,10 @@ define void @test_linear4_linear8(ptr noalias %a, ptr readnone %b, ptr readonly
; SVE_OR_NEON_INTERLEAVE: [[TMP35:%.*]] = extractelement <vscale x 4 x ptr> [[TMP31:%.*]], i32 0
; SVE_OR_NEON_INTERLEAVE: [[TMP36:%.*]] = extractelement <vscale x 4 x ptr> [[TMP33:%.*]], i32 0
; SVE_OR_NEON_INTERLEAVE: [[TMP37:%.*]] = call <vscale x 4 x i32> @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
; SVE_OR_NEON_INTERLEAVE: [[TMP38:%.*]] = call <vscale x 4 x i32> @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK2:%.*]])
; SVE_OR_NEON_INTERLEAVE: [[TMP50:%.*]] = extractelement <vscale x 4 x i1> [[TMP48:%.*]], i32 0
; SVE_OR_NEON_INTERLEAVE: [[TMP38:%.*]] = extractelement <vscale x 4 x ptr> [[TMP32:%.*]], i32 0
; SVE_OR_NEON_INTERLEAVE: [[TMP39:%.*]] = extractelement <vscale x 4 x ptr> [[TMP34:%.*]], i32 0
; SVE_OR_NEON_INTERLEAVE: [[TMP40:%.*]] = call <vscale x 4 x i32> @vec_quux_linear4_linear8_mask_sve(ptr [[TMP38]], ptr [[TMP39]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK2:%.*]])
; SVE_OR_NEON_INTERLEAVE: [[TMP52:%.*]] = extractelement <vscale x 4 x i1> [[TMP50:%.*]], i32 0
; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR7:[0-9]+]]
;
; SVE_TF-LABEL: define void @test_linear4_linear8
Expand All @@ -260,8 +269,10 @@ define void @test_linear4_linear8(ptr noalias %a, ptr readnone %b, ptr readonly
; SVE_TF_INTERLEAVE: [[TMP35:%.*]] = extractelement <vscale x 4 x ptr> [[TMP31:%.*]], i32 0
; SVE_TF_INTERLEAVE: [[TMP36:%.*]] = extractelement <vscale x 4 x ptr> [[TMP33:%.*]], i32 0
; SVE_TF_INTERLEAVE: [[TMP37:%.*]] = call <vscale x 4 x i32> @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
; SVE_TF_INTERLEAVE: [[TMP38:%.*]] = call <vscale x 4 x i32> @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK2:%.*]])
; SVE_TF_INTERLEAVE: [[TMP50:%.*]] = extractelement <vscale x 4 x i1> [[TMP48:%.*]], i32 0
; SVE_TF_INTERLEAVE: [[TMP38:%.*]] = extractelement <vscale x 4 x ptr> [[TMP32:%.*]], i32 0
; SVE_TF_INTERLEAVE: [[TMP39:%.*]] = extractelement <vscale x 4 x ptr> [[TMP34:%.*]], i32 0
; SVE_TF_INTERLEAVE: [[TMP40:%.*]] = call <vscale x 4 x i32> @vec_quux_linear4_linear8_mask_sve(ptr [[TMP38]], ptr [[TMP39]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK2:%.*]])
; SVE_TF_INTERLEAVE: [[TMP52:%.*]] = extractelement <vscale x 4 x i1> [[TMP50:%.*]], i32 0
; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR7:[0-9]+]]
;
entry:
Expand Down Expand Up @@ -293,7 +304,8 @@ define void @test_linear3_non_ptr(ptr noalias %a, i64 %n) {
; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) {
; NEON_INTERLEAVE: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2:%.*]], i32 0
; NEON_INTERLEAVE: [[TMP5:%.*]] = call <4 x i32> @vec_bar_linear3_nomask_neon(i32 [[TMP4]])
; NEON_INTERLEAVE: [[TMP6:%.*]] = call <4 x i32> @vec_bar_linear3_nomask_neon(i32 [[TMP4]])
; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3:%.*]], i32 0
; NEON_INTERLEAVE: [[TMP7:%.*]] = call <4 x i32> @vec_bar_linear3_nomask_neon(i32 [[TMP6]])
; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR4:[0-9]+]]
;
; SVE_OR_NEON-LABEL: define void @test_linear3_non_ptr
Expand Down Expand Up @@ -343,7 +355,8 @@ define void @test_linearn5_non_ptr_neg_stride(ptr noalias %a, i64 %n) {
; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) {
; NEON_INTERLEAVE: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2:%.*]], i32 0
; NEON_INTERLEAVE: [[TMP5:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP4]])
; NEON_INTERLEAVE: [[TMP6:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP4]])
; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3:%.*]], i32 0
; NEON_INTERLEAVE: [[TMP7:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP6]])
; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR5:[0-9]+]]
;
; SVE_OR_NEON-LABEL: define void @test_linearn5_non_ptr_neg_stride
Expand Down Expand Up @@ -393,7 +406,8 @@ define void @test_linear8_return_void(ptr noalias %in, ptr noalias %out, i64 %n)
; NEON_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) {
; NEON_INTERLEAVE: [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP6:%.*]], i32 0
; NEON_INTERLEAVE: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD:%.*]], ptr [[TMP8]])
; NEON_INTERLEAVE: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD2:%.*]], ptr [[TMP8]])
; NEON_INTERLEAVE: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP7:%.*]], i32 0
; NEON_INTERLEAVE: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD2:%.*]], ptr [[TMP9]])
; NEON_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR6:[0-9]+]]
;
; SVE_OR_NEON-LABEL: define void @test_linear8_return_void
Expand All @@ -406,8 +420,9 @@ define void @test_linear8_return_void(ptr noalias %in, ptr noalias %out, i64 %n)
; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; SVE_OR_NEON_INTERLEAVE: [[TMP39:%.*]] = extractelement <vscale x 2 x ptr> [[TMP37:%.*]], i32 0
; SVE_OR_NEON_INTERLEAVE: call void @vec_goo_linear8_mask_sve(<vscale x 2 x i64> [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP39]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
; SVE_OR_NEON_INTERLEAVE: call void @vec_goo_linear8_mask_sve(<vscale x 2 x i64> [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP39]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2:%.*]])
; SVE_OR_NEON_INTERLEAVE: [[TMP45:%.*]] = extractelement <vscale x 2 x i1> [[TMP43:%.*]], i32 0
; SVE_OR_NEON_INTERLEAVE: [[TMP40:%.*]] = extractelement <vscale x 2 x ptr> [[TMP38:%.*]], i32 0
; SVE_OR_NEON_INTERLEAVE: call void @vec_goo_linear8_mask_sve(<vscale x 2 x i64> [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP40]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2:%.*]])
; SVE_OR_NEON_INTERLEAVE: [[TMP46:%.*]] = extractelement <vscale x 2 x i1> [[TMP44:%.*]], i32 0
; SVE_OR_NEON_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR10:[0-9]+]]
;
; SVE_TF-LABEL: define void @test_linear8_return_void
Expand All @@ -421,8 +436,9 @@ define void @test_linear8_return_void(ptr noalias %in, ptr noalias %out, i64 %n)
; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; SVE_TF_INTERLEAVE: [[TMP39:%.*]] = extractelement <vscale x 2 x ptr> [[TMP37:%.*]], i32 0
; SVE_TF_INTERLEAVE: call void @vec_goo_linear8_mask_sve(<vscale x 2 x i64> [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP39]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
; SVE_TF_INTERLEAVE: call void @vec_goo_linear8_mask_sve(<vscale x 2 x i64> [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP39]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2:%.*]])
; SVE_TF_INTERLEAVE: [[TMP45:%.*]] = extractelement <vscale x 2 x i1> [[TMP43:%.*]], i32 0
; SVE_TF_INTERLEAVE: [[TMP40:%.*]] = extractelement <vscale x 2 x ptr> [[TMP38:%.*]], i32 0
; SVE_TF_INTERLEAVE: call void @vec_goo_linear8_mask_sve(<vscale x 2 x i64> [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP40]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2:%.*]])
; SVE_TF_INTERLEAVE: [[TMP46:%.*]] = extractelement <vscale x 2 x i1> [[TMP44:%.*]], i32 0
; SVE_TF_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR10:[0-9]+]]
;
entry:
Expand Down

0 comments on commit 6e9673f

Please sign in to comment.