Skip to content

Commit de74fc6

Browse files
committed
[AArch64][CostModel] Improve cost estimate of scalarizing a vector division
In the backend, the last resort for finding the cost of a vector division is to use its scalar cost. However, without knowledge of the division operands, that cost can be off in certain cases. For SLP, this patch tries to pass the scalars so that the backend can make a better scalar cost estimate.
1 parent d02c167 commit de74fc6

File tree

3 files changed

+31
-32
lines changed

3 files changed

+31
-32
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3472,6 +3472,27 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
34723472
Cost *= 4;
34733473
return Cost;
34743474
} else {
3475+
// If the information about individual scalars being vectorized is
3476+
// available, this yields better cost estimation.
3477+
if (auto *VTy = dyn_cast<FixedVectorType>(Ty);
3478+
VTy && !Args.empty() && all_of(Args, [Opcode](const Value *V) {
3479+
auto *I = dyn_cast<Instruction>(V);
3480+
return I && I->getOpcode() == Opcode &&
3481+
!V->getType()->isVectorTy();
3482+
})) {
3483+
InstructionCost InsertExtractCost =
3484+
ST->getVectorInsertExtractBaseCost();
3485+
Cost = (3 * InsertExtractCost) * VTy->getNumElements();
3486+
for (auto *V : Args) {
3487+
auto *I = cast<Instruction>(V);
3488+
Cost +=
3489+
getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind,
3490+
TTI::getOperandInfo(I->getOperand(0)),
3491+
TTI::getOperandInfo(I->getOperand(1)));
3492+
}
3493+
return Cost;
3494+
}
3495+
34753496
// If one of the operands is a uniform constant then the cost for each
34763497
// element is Cost for insertion, extraction and division.
34773498
// Insertion cost = 2, Extraction Cost = 2, Division = cost for the

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11561,9 +11561,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1156111561
unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
1156211562
TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
1156311563
TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
11564-
return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
11565-
Op2Info, {}, nullptr, TLI) +
11566-
CommonCost;
11564+
return CommonCost +
11565+
TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
11566+
Op2Info, E->Scalars, nullptr, TLI);
1156711567
};
1156811568
return GetCostDiff(GetScalarCost, GetVectorCost);
1156911569
}

llvm/test/Transforms/SLPVectorizer/AArch64/div.ll

Lines changed: 7 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -554,35 +554,13 @@ define <4 x i32> @slp_v4i32_Op1_unknown_Op2_const_pow2(<4 x i32> %a)
554554

555555
; computes (a/const + x - y) * z
556556
define <2 x i32> @vectorize_sdiv_v2i32(<2 x i32> %a, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
557-
; NO-SVE-LABEL: define <2 x i32> @vectorize_sdiv_v2i32(
558-
; NO-SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
559-
; NO-SVE-NEXT: [[A0:%.*]] = extractelement <2 x i32> [[A]], i64 0
560-
; NO-SVE-NEXT: [[A1:%.*]] = extractelement <2 x i32> [[A]], i64 1
561-
; NO-SVE-NEXT: [[TMP1:%.*]] = sdiv i32 [[A0]], 2
562-
; NO-SVE-NEXT: [[TMP2:%.*]] = sdiv i32 [[A1]], 4
563-
; NO-SVE-NEXT: [[X0:%.*]] = extractelement <2 x i32> [[X]], i64 0
564-
; NO-SVE-NEXT: [[X1:%.*]] = extractelement <2 x i32> [[X]], i64 1
565-
; NO-SVE-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[X0]]
566-
; NO-SVE-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], [[X1]]
567-
; NO-SVE-NEXT: [[Y0:%.*]] = extractelement <2 x i32> [[Y]], i64 0
568-
; NO-SVE-NEXT: [[Y1:%.*]] = extractelement <2 x i32> [[Y]], i64 1
569-
; NO-SVE-NEXT: [[TMP5:%.*]] = sub i32 [[TMP3]], [[Y0]]
570-
; NO-SVE-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[Y1]]
571-
; NO-SVE-NEXT: [[Z0:%.*]] = extractelement <2 x i32> [[Z]], i64 0
572-
; NO-SVE-NEXT: [[Z1:%.*]] = extractelement <2 x i32> [[Z]], i64 1
573-
; NO-SVE-NEXT: [[TMP7:%.*]] = mul i32 [[TMP5]], [[Z0]]
574-
; NO-SVE-NEXT: [[TMP8:%.*]] = mul i32 [[TMP6]], [[Z1]]
575-
; NO-SVE-NEXT: [[RES0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
576-
; NO-SVE-NEXT: [[RES1:%.*]] = insertelement <2 x i32> [[RES0]], i32 [[TMP8]], i32 1
577-
; NO-SVE-NEXT: ret <2 x i32> [[RES1]]
578-
;
579-
; SVE-LABEL: define <2 x i32> @vectorize_sdiv_v2i32(
580-
; SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
581-
; SVE-NEXT: [[TMP1:%.*]] = sdiv <2 x i32> [[A]], <i32 2, i32 4>
582-
; SVE-NEXT: [[TMP2:%.*]] = add <2 x i32> [[TMP1]], [[X]]
583-
; SVE-NEXT: [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], [[Y]]
584-
; SVE-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP3]], [[Z]]
585-
; SVE-NEXT: ret <2 x i32> [[TMP4]]
557+
; CHECK-LABEL: define <2 x i32> @vectorize_sdiv_v2i32(
558+
; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
559+
; CHECK-NEXT: [[TMP1:%.*]] = sdiv <2 x i32> [[A]], <i32 2, i32 4>
560+
; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i32> [[TMP1]], [[X]]
561+
; CHECK-NEXT: [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], [[Y]]
562+
; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP3]], [[Z]]
563+
; CHECK-NEXT: ret <2 x i32> [[TMP4]]
586564
;
587565
{
588566
%a0 = extractelement <2 x i32> %a, i64 0

0 commit comments

Comments
 (0)