From d0057d22453e62c211439446e024abf7af71c086 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Fri, 19 Jan 2024 11:07:17 +0000 Subject: [PATCH 1/2] [LV] Fix handling of interleaving linear args Currently when interleaving vector calls with linear arguments, the Part is ignored and all vector calls use the initial value from the first lane of the current iteration. Fix this to extract from the correct part of the linear vector. --- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 7 +-- .../AArch64/vector-call-linear-args.ll | 54 ++++++++++++------- 2 files changed, 39 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index bbeb5da2cfec3..beb4e9b148cb7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -600,10 +600,11 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { // Some vectorized function variants may also take a scalar argument, // e.g. linear parameters for pointers. Value *Arg; - if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) || - (UseIntrinsic && - isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))) + if (UseIntrinsic && + isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())) Arg = State.get(I.value(), VPIteration(0, 0)); + else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy()) + Arg = State.get(I.value(), VPIteration(Part, 0)); else Arg = State.get(I.value(), Part); if (UseIntrinsic && diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll index 29440ca174248..f60ab5e848dd3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll @@ -21,7 +21,8 @@ define void @test_linear8(ptr noalias %a, ptr readnone %b, i64 %n) { ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) { ; NEON_INTERLEAVE: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP2:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP5:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP4]]) -; NEON_INTERLEAVE: [[TMP6:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP4]]) +; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP3:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP7:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP6]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR0:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_linear8 @@ -34,8 +35,9 @@ define void @test_linear8(ptr noalias %a, ptr readnone %b, i64 %n) { ; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; SVE_OR_NEON_INTERLEAVE: [[TMP33:%.*]] = extractelement [[TMP31:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: [[TMP34:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP33]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP35:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP33]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP47:%.*]] = extractelement [[TMP45:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP35:%.*]] = extractelement [[TMP32:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP36:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP35]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_OR_NEON_INTERLEAVE: [[TMP48:%.*]] = extractelement [[TMP46:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_linear8 @@ -49,8 +51,9 @@ define void @test_linear8(ptr noalias %a, ptr readnone %b, i64 %n) { ; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; SVE_TF_INTERLEAVE: [[TMP33:%.*]] = extractelement [[TMP31:%.*]], i32 0 ; SVE_TF_INTERLEAVE: [[TMP34:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP33]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP35:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP33]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP47:%.*]] = extractelement [[TMP45:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP35:%.*]] = extractelement [[TMP32:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP36:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP35]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_TF_INTERLEAVE: [[TMP48:%.*]] = extractelement [[TMP46:%.*]], i32 0 ; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] ; entry: @@ -81,7 +84,8 @@ define void @test_vector_linear4(ptr noalias %a, ptr readnone %b, ptr readonly % ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) { ; NEON_INTERLEAVE: [[TMP8:%.*]] = extractelement <4 x ptr> [[TMP6:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP9:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD:%.*]], ptr [[TMP8]]) -; NEON_INTERLEAVE: [[TMP10:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD2:%.*]], ptr [[TMP8]]) +; NEON_INTERLEAVE: [[TMP10:%.*]] = extractelement <4 x ptr> [[TMP7:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP11:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD2:%.*]], ptr [[TMP10]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR1:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_vector_linear4 @@ -176,7 +180,8 @@ define void @test_linear16_wide_stride(ptr noalias %a, ptr readnone %b, i64 %n) ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) { ; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP4:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP7:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP6]]) -; NEON_INTERLEAVE: [[TMP8:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP6]]) +; NEON_INTERLEAVE: [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP5:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP9:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP8]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2]] ; ; SVE_OR_NEON-LABEL: define void @test_linear16_wide_stride @@ -228,7 +233,9 @@ define void @test_linear4_linear8(ptr noalias %a, ptr readnone %b, ptr readonly ; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP2:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP7:%.*]] = extractelement <4 x ptr> [[TMP4:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP8:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP6]], ptr [[TMP7]]) -; NEON_INTERLEAVE: [[TMP9:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP6]], ptr [[TMP7]]) +; NEON_INTERLEAVE: [[TMP9:%.*]] = extractelement <4 x ptr> [[TMP3:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP10:%.*]] = extractelement <4 x ptr> [[TMP5:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP11:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP9]], ptr [[TMP10]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_linear4_linear8 @@ -243,8 +250,10 @@ define void @test_linear4_linear8(ptr noalias %a, ptr readnone %b, ptr readonly ; SVE_OR_NEON_INTERLEAVE: [[TMP35:%.*]] = extractelement [[TMP31:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: [[TMP36:%.*]] = extractelement [[TMP33:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: [[TMP37:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP38:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP50:%.*]] = extractelement [[TMP48:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP38:%.*]] = extractelement [[TMP32:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP39:%.*]] = extractelement [[TMP34:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP40:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP38]], ptr [[TMP39]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_OR_NEON_INTERLEAVE: [[TMP52:%.*]] = extractelement [[TMP50:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR7:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_linear4_linear8 @@ -260,8 +269,10 @@ define void @test_linear4_linear8(ptr noalias %a, ptr readnone %b, ptr readonly ; SVE_TF_INTERLEAVE: [[TMP35:%.*]] = extractelement [[TMP31:%.*]], i32 0 ; SVE_TF_INTERLEAVE: [[TMP36:%.*]] = extractelement [[TMP33:%.*]], i32 0 ; SVE_TF_INTERLEAVE: [[TMP37:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP38:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP50:%.*]] = extractelement [[TMP48:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP38:%.*]] = extractelement [[TMP32:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP39:%.*]] = extractelement [[TMP34:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP40:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP38]], ptr [[TMP39]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_TF_INTERLEAVE: [[TMP52:%.*]] = extractelement [[TMP50:%.*]], i32 0 ; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR7:[0-9]+]] ; entry: @@ -293,7 +304,8 @@ define void @test_linear3_non_ptr(ptr noalias %a, i64 %n) { ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) { ; NEON_INTERLEAVE: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP5:%.*]] = call <4 x i32> @vec_bar_linear3_nomask_neon(i32 [[TMP4]]) -; NEON_INTERLEAVE: [[TMP6:%.*]] = call <4 x i32> @vec_bar_linear3_nomask_neon(i32 [[TMP4]]) +; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP7:%.*]] = call <4 x i32> @vec_bar_linear3_nomask_neon(i32 [[TMP6]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR4:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_linear3_non_ptr @@ -343,7 +355,8 @@ define void @test_linearn5_non_ptr_neg_stride(ptr noalias %a, i64 %n) { ; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) { ; NEON_INTERLEAVE: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2:%.*]], i32 0 ; NEON_INTERLEAVE: [[TMP5:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP4]]) -; NEON_INTERLEAVE: [[TMP6:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP4]]) +; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP7:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP6]]) ; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR5:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_linearn5_non_ptr_neg_stride @@ -393,7 +406,8 @@ define void @test_linear8_return_void(ptr noalias %in, ptr noalias %out, i64 %n) ; NEON_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) { ; NEON_INTERLEAVE: [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP6:%.*]], i32 0 ; NEON_INTERLEAVE: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD:%.*]], ptr [[TMP8]]) -; NEON_INTERLEAVE: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD2:%.*]], ptr [[TMP8]]) +; NEON_INTERLEAVE: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP7:%.*]], i32 0 +; NEON_INTERLEAVE: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD2:%.*]], ptr [[TMP9]]) ; NEON_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR6:[0-9]+]] ; ; SVE_OR_NEON-LABEL: define void @test_linear8_return_void @@ -406,8 +420,9 @@ define void @test_linear8_return_void(ptr noalias %in, ptr noalias %out, i64 %n) ; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; SVE_OR_NEON_INTERLEAVE: [[TMP39:%.*]] = extractelement [[TMP37:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP39]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_OR_NEON_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP39]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_OR_NEON_INTERLEAVE: [[TMP45:%.*]] = extractelement [[TMP43:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP40:%.*]] = extractelement [[TMP38:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP40]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_OR_NEON_INTERLEAVE: [[TMP46:%.*]] = extractelement [[TMP44:%.*]], i32 0 ; SVE_OR_NEON_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR10:[0-9]+]] ; ; SVE_TF-LABEL: define void @test_linear8_return_void @@ -421,8 +436,9 @@ define void @test_linear8_return_void(ptr noalias %in, ptr noalias %out, i64 %n) ; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; SVE_TF_INTERLEAVE: [[TMP39:%.*]] = extractelement [[TMP37:%.*]], i32 0 ; SVE_TF_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP39]], [[ACTIVE_LANE_MASK:%.*]]) -; SVE_TF_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP39]], [[ACTIVE_LANE_MASK2:%.*]]) -; SVE_TF_INTERLEAVE: [[TMP45:%.*]] = extractelement [[TMP43:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP40:%.*]] = extractelement [[TMP38:%.*]], i32 0 +; SVE_TF_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP40]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_TF_INTERLEAVE: [[TMP46:%.*]] = extractelement [[TMP44:%.*]], i32 0 ; SVE_TF_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR10:[0-9]+]] ; entry: From 1bd29c48760cc1ac75543e9db05337c109222639 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Thu, 25 Jan 2024 14:04:44 +0000 Subject: [PATCH 2/2] Clarify comments on scalar params when executing recipe --- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index beb4e9b148cb7..ae2fc522ba400 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -597,12 +597,13 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { for (const auto &I : enumerate(operands())) { // Some intrinsics have a scalar argument - don't replace it with a // vector. - // Some vectorized function variants may also take a scalar argument, - // e.g. linear parameters for pointers. Value *Arg; if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())) Arg = State.get(I.value(), VPIteration(0, 0)); + // Some vectorized function variants may also take a scalar argument, + // e.g. linear parameters for pointers. This needs to be the scalar value + // from the start of the respective part when interleaving. else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy()) Arg = State.get(I.value(), VPIteration(Part, 0)); else