Skip to content

Commit 47a5c36

Browse files
committed
[AMDGPU] Improve code size cost model (part 2)
Summary: Added estimations for ShuffleVector, some cast and arithmetic instructions.
Reviewers: rampitec
Reviewed By: rampitec
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, zzheng, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69629
1 parent 59f063b commit 47a5c36

File tree

12 files changed

+131
-20
lines changed

12 files changed

+131
-20
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 98 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -695,34 +695,114 @@ void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
695695

696696
unsigned GCNTTIImpl::getUserCost(const User *U,
697697
ArrayRef<const Value *> Operands) {
698-
// Estimate extractelement elimination
699-
if (const ExtractElementInst *EE = dyn_cast<ExtractElementInst>(U)) {
700-
ConstantInt *CI = dyn_cast<ConstantInt>(EE->getOperand(1));
698+
const Instruction *I = dyn_cast<Instruction>(U);
699+
if (!I)
700+
return BaseT::getUserCost(U, Operands);
701+
702+
// Estimate different operations to be optimized out
703+
switch (I->getOpcode()) {
704+
case Instruction::ExtractElement: {
705+
ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
701706
unsigned Idx = -1;
702707
if (CI)
703708
Idx = CI->getZExtValue();
704-
return getVectorInstrCost(EE->getOpcode(), EE->getOperand(0)->getType(),
705-
Idx);
709+
return getVectorInstrCost(I->getOpcode(), I->getOperand(0)->getType(), Idx);
706710
}
707-
708-
// Estimate insertelement elimination
709-
if (const InsertElementInst *IE = dyn_cast<InsertElementInst>(U)) {
710-
ConstantInt *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
711+
case Instruction::InsertElement: {
712+
ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2));
711713
unsigned Idx = -1;
712714
if (CI)
713715
Idx = CI->getZExtValue();
714-
return getVectorInstrCost(IE->getOpcode(), IE->getType(), Idx);
716+
return getVectorInstrCost(I->getOpcode(), I->getType(), Idx);
717+
}
718+
case Instruction::Call: {
719+
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
720+
SmallVector<Value *, 4> Args(II->arg_operands());
721+
FastMathFlags FMF;
722+
if (auto *FPMO = dyn_cast<FPMathOperator>(II))
723+
FMF = FPMO->getFastMathFlags();
724+
return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args,
725+
FMF);
726+
} else {
727+
return BaseT::getUserCost(U, Operands);
728+
}
715729
}
730+
case Instruction::ShuffleVector: {
731+
const ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
732+
Type *Ty = Shuffle->getType();
733+
Type *SrcTy = Shuffle->getOperand(0)->getType();
734+
735+
// TODO: Identify and add costs for insert subvector, etc.
736+
int SubIndex;
737+
if (Shuffle->isExtractSubvectorMask(SubIndex))
738+
return getShuffleCost(TTI::SK_ExtractSubvector, SrcTy, SubIndex, Ty);
739+
740+
if (Shuffle->changesLength())
741+
return -1;
742+
743+
if (Shuffle->isIdentity())
744+
return 0;
745+
746+
if (Shuffle->isReverse())
747+
return getShuffleCost(TTI::SK_Reverse, Ty, 0, nullptr);
716748

717-
// Estimate different intrinsics, e.g. llvm.fabs
718-
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
719-
SmallVector<Value *, 4> Args(II->arg_operands());
720-
FastMathFlags FMF;
721-
if (auto *FPMO = dyn_cast<FPMathOperator>(II))
722-
FMF = FPMO->getFastMathFlags();
723-
return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args,
724-
FMF);
749+
if (Shuffle->isSelect())
750+
return getShuffleCost(TTI::SK_Select, Ty, 0, nullptr);
751+
752+
if (Shuffle->isTranspose())
753+
return getShuffleCost(TTI::SK_Transpose, Ty, 0, nullptr);
754+
755+
if (Shuffle->isZeroEltSplat())
756+
return getShuffleCost(TTI::SK_Broadcast, Ty, 0, nullptr);
757+
758+
if (Shuffle->isSingleSource())
759+
return getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, nullptr);
760+
761+
return getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, 0, nullptr);
762+
}
763+
case Instruction::ZExt:
764+
case Instruction::SExt:
765+
case Instruction::FPToUI:
766+
case Instruction::FPToSI:
767+
case Instruction::FPExt:
768+
case Instruction::PtrToInt:
769+
case Instruction::IntToPtr:
770+
case Instruction::SIToFP:
771+
case Instruction::UIToFP:
772+
case Instruction::Trunc:
773+
case Instruction::FPTrunc:
774+
case Instruction::BitCast:
775+
case Instruction::AddrSpaceCast: {
776+
return getCastInstrCost(I->getOpcode(), I->getType(),
777+
I->getOperand(0)->getType(), I);
725778
}
779+
case Instruction::Add:
780+
case Instruction::FAdd:
781+
case Instruction::Sub:
782+
case Instruction::FSub:
783+
case Instruction::Mul:
784+
case Instruction::FMul:
785+
case Instruction::UDiv:
786+
case Instruction::SDiv:
787+
case Instruction::FDiv:
788+
case Instruction::URem:
789+
case Instruction::SRem:
790+
case Instruction::FRem:
791+
case Instruction::Shl:
792+
case Instruction::LShr:
793+
case Instruction::AShr:
794+
case Instruction::And:
795+
case Instruction::Or:
796+
case Instruction::Xor:
797+
case Instruction::FNeg: {
798+
return getArithmeticInstrCost(I->getOpcode(), I->getType(),
799+
TTI::OK_AnyValue, TTI::OK_AnyValue,
800+
TTI::OP_None, TTI::OP_None, Operands);
801+
}
802+
default:
803+
break;
804+
}
805+
726806
return BaseT::getUserCost(U, Operands);
727807
}
728808

llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck %s
22
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck %s
3+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck %s
4+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck %s
5+
36

47
; CHECK: 'add_i32'
58
; CHECK: estimated cost of 1 for {{.*}} add i32

llvm/test/Analysis/CostModel/AMDGPU/addrspacecast.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck %s
2+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck %s
23

34
; CHECK-LABEL: 'addrspacecast_global_to_flat'
45
; CHECK: estimated cost of 0 for {{.*}} addrspacecast i8 addrspace(1)* %ptr to i8*

llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
2+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
23

34
; CHECK: 'or_i32'
45
; CHECK: estimated cost of 1 for {{.*}} or i32

llvm/test/Analysis/CostModel/AMDGPU/fadd.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
22
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s
3+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
4+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s
35

46
; ALL: 'fadd_f32'
57
; ALL: estimated cost of 1 for {{.*}} fadd float

llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,13 @@
55
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -mattr=+fp32-denormals < %s | FileCheck -check-prefixes=ALL,FP32DENORMS,SLOWFP32DENORMS,NOFP16,NOFP16-FP32DENORM %s
66
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+fp32-denormals < %s | FileCheck -check-prefixes=ALL,FP32DENORMS,FASTFP32DENORMS,FP16 %s
77

8+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,CIFASTF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM %s
9+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,CISLOWF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM %s
10+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIFASTF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM %s
11+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SISLOWF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM %s
12+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -mattr=+fp32-denormals < %s | FileCheck -check-prefixes=ALL,FP32DENORMS,SLOWFP32DENORMS,NOFP16,NOFP16-FP32DENORM %s
13+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+fp32-denormals < %s | FileCheck -check-prefixes=ALL,FP32DENORMS,FASTFP32DENORMS,FP16 %s
14+
815
; ALL: 'fdiv_f32'
916
; NOFP32DENORM: estimated cost of 12 for {{.*}} fdiv float
1017
; FP32DENORMS: estimated cost of 10 for {{.*}} fdiv float

llvm/test/Analysis/CostModel/AMDGPU/fmul.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
22
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s
3+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
4+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s
35

46
; ALL: 'fmul_f32'
57
; ALL: estimated cost of 1 for {{.*}} fmul float

llvm/test/Analysis/CostModel/AMDGPU/fsub.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
22
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s
3+
; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
4+
; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s
35

46
; ALL: 'fsub_f32'
57
; ALL: estimated cost of 1 for {{.*}} fsub float

llvm/test/Analysis/CostModel/AMDGPU/mul.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
2+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
23

34
; CHECK: 'mul_i32'
45
; CHECK: estimated cost of 3 for {{.*}} mul i32

llvm/test/Analysis/CostModel/AMDGPU/shifts.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=ALL -check-prefix=FAST64 %s
22
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=ALL -check-prefix=SLOW64 %s
3+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=ALL -check-prefix=FAST64 %s
4+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=ALL -check-prefix=SLOW64 %s
35

46
; ALL: 'shl_i32'
57
; ALL: estimated cost of 1 for {{.*}} shl i32

llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,48 @@
11
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GFX9,GCN %s
22
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=VI,GCN %s
3+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GFX9,GCN %s
4+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=VI,GCN %s
35

6+
; GCN-LABEL: 'shufflevector_00_v2i16'
47
; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> zeroinitializer
8+
; VI: estimated cost of 1 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> zeroinitializer
59
define amdgpu_kernel void @shufflevector_00_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
610
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
711
%shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> zeroinitializer
812
store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
913
ret void
1014
}
1115

12-
; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
16+
; GCN-LABEL: 'shufflevector_01_v2i16'
17+
; GCN: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
1318
define amdgpu_kernel void @shufflevector_01_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
1419
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
1520
%shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
1621
store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
1722
ret void
1823
}
1924

25+
; GCN-LABEL: 'shufflevector_10_v2i16'
2026
; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
27+
; VI: estimated cost of 2 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
2128
define amdgpu_kernel void @shufflevector_10_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
2229
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
2330
%shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
2431
store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
2532
ret void
2633
}
2734

35+
; GCN-LABEL: 'shufflevector_11_v2i16'
2836
; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
37+
; VI: estimated cost of 2 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
2938
define amdgpu_kernel void @shufflevector_11_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
3039
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
3140
%shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
3241
store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
3342
ret void
3443
}
3544

45+
; GCN-LABEL: 'shufflevector_02_v2i16'
3646
; GCN: estimated cost of 2 for {{.*}} shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
3747
define amdgpu_kernel void @shufflevector_02_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr0, <2 x i16> addrspace(1)* %vaddr1) {
3848
%vec0 = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr0

llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; RUN: opt -data-layout=A5 -mtriple=amdgcn-unknown-amdhsa -loop-unroll -S -amdgpu-unroll-threshold-private=20000 %s | FileCheck %s
1+
; RUN: opt -data-layout=A5 -mtriple=amdgcn-unknown-amdhsa -loop-unroll -S -amdgpu-unroll-threshold-private=12000 %s | FileCheck %s
22

33
; Check that we full unroll loop to be able to eliminate alloca
44
; CHECK-LABEL: @non_invariant_ind

0 commit comments

Comments
 (0)