Skip to content

Commit 4df49ed

Browse files
mariusz-sikora-at-amd, matejaMarjanovic, and mbrkusanin
authored and committed
[AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/bf8 instructions (llvm#78414)
Add VOP1, VOP1_DPP8, VOP1_DPP16, VOP3, VOP3_DPP8, VOP3_DPP16 instructions that were supported on GFX940 (MI300):
- V_CVT_F32_FP8
- V_CVT_F32_BF8
- V_CVT_PK_F32_FP8
- V_CVT_PK_F32_BF8
- V_CVT_PK_FP8_F32
- V_CVT_PK_BF8_F32
- V_CVT_SR_FP8_F32
- V_CVT_SR_BF8_F32
---------
Co-authored-by: Mateja Marjanovic <[email protected]>
Co-authored-by: Mirko Brkušanin <[email protected]>
(cherry picked from commit cfddb59)
1 parent aa4cb0e commit 4df49ed

34 files changed

+1742
-102
lines changed

clang/test/CodeGenOpenCL/amdgpu-features.cl

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -100,8 +100,8 @@
100100
// GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
101101
// GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
102102
// GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
103-
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
104-
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
103+
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
104+
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
105105

106106
// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
107107

clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl

Lines changed: 18 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1,59 +1,60 @@
11
// REQUIRES: amdgpu-registered-target
2-
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
2+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s
3+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s
34

45
typedef float v2f __attribute__((ext_vector_type(2)));
56

6-
// CHECK-GFX940-LABEL: @test_cvt_f32_bf8
7-
// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
7+
// CHECK-LABEL: @test_cvt_f32_bf8
8+
// CHECK: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
89
void test_cvt_f32_bf8(global int* out, int a)
910
{
1011
*out = __builtin_amdgcn_cvt_f32_bf8(a, 0);
1112
}
1213

13-
// CHECK-GFX940-LABEL: @test_cvt_f32_fp8
14-
// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
14+
// CHECK-LABEL: @test_cvt_f32_fp8
15+
// CHECK: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
1516
void test_cvt_f32_fp8(global int* out, int a)
1617
{
1718
*out = __builtin_amdgcn_cvt_f32_fp8(a, 1);
1819
}
1920

20-
// CHECK-GFX940-LABEL: @test_cvt_pk_f32_bf8
21-
// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
21+
// CHECK-LABEL: @test_cvt_pk_f32_bf8
22+
// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
2223
void test_cvt_pk_f32_bf8(global v2f* out, int a)
2324
{
2425
*out = __builtin_amdgcn_cvt_pk_f32_bf8(a, false);
2526
}
2627

27-
// CHECK-GFX940-LABEL: @test_cvt_pk_f32_fp8
28-
// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
28+
// CHECK-LABEL: @test_cvt_pk_f32_fp8
29+
// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
2930
void test_cvt_pk_f32_fp8(global v2f* out, int a)
3031
{
3132
*out = __builtin_amdgcn_cvt_pk_f32_fp8(a, true);
3233
}
3334

34-
// CHECK-GFX940-LABEL: @test_cvt_pk_bf8_f32
35-
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false)
35+
// CHECK-LABEL: @test_cvt_pk_bf8_f32
36+
// CHECK: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false)
3637
void test_cvt_pk_bf8_f32(global int* out, int old, float a, float b)
3738
{
3839
*out = __builtin_amdgcn_cvt_pk_bf8_f32(a, b, old, false);
3940
}
4041

41-
// CHECK-GFX940-LABEL: @test_cvt_pk_fp8_f32
42-
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true)
42+
// CHECK-LABEL: @test_cvt_pk_fp8_f32
43+
// CHECK: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true)
4344
void test_cvt_pk_fp8_f32(global int* out, int old, float a, float b)
4445
{
4546
*out = __builtin_amdgcn_cvt_pk_fp8_f32(a, b, old, true);
4647
}
4748

48-
// CHECK-GFX940-LABEL: @test_cvt_sr_bf8_f32
49-
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2)
49+
// CHECK-LABEL: @test_cvt_sr_bf8_f32
50+
// CHECK: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2)
5051
void test_cvt_sr_bf8_f32(global int* out, int old, float a, int b)
5152
{
5253
*out = __builtin_amdgcn_cvt_sr_bf8_f32(a, b, old, 2);
5354
}
5455

55-
// CHECK-GFX940-LABEL: @test_cvt_sr_fp8_f32
56-
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3)
56+
// CHECK-LABEL: @test_cvt_sr_fp8_f32
57+
// CHECK: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3)
5758
void test_cvt_sr_fp8_f32(global int* out, int old, float a, int b)
5859
{
5960
*out = __builtin_amdgcn_cvt_sr_fp8_f32(a, b, old, 3);

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1506,6 +1506,7 @@ def FeatureISAVersion12 : FeatureSet<
15061506
FeatureFlatAtomicFaddF32Inst,
15071507
FeatureImageInsts,
15081508
FeatureExtendedImageInsts,
1509+
FeatureFP8ConversionInsts,
15091510
FeaturePackedTID,
15101511
FeatureVcmpxPermlaneHazard,
15111512
FeatureSALUFloatInsts,

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 29 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3500,6 +3500,9 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) {
35003500
return !isInlineConstant(Inst, OpIdx);
35013501
} else if (MO.isReg()) {
35023502
auto Reg = MO.getReg();
3503+
if (!Reg) {
3504+
return false;
3505+
}
35033506
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
35043507
auto PReg = mc2PseudoReg(Reg);
35053508
return isSGPR(PReg, TRI) && PReg != SGPR_NULL;
@@ -8303,12 +8306,20 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
83038306
const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0;
83048307

83058308
if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi ||
8306-
Opc == AMDGPU::V_CVT_SR_FP8_F32_vi) {
8309+
Opc == AMDGPU::V_CVT_SR_FP8_F32_vi ||
8310+
Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_gfx12 ||
8311+
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_gfx12) {
83078312
Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods
83088313
Inst.addOperand(Inst.getOperand(0));
83098314
}
83108315

8311-
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in)) {
8316+
// Adding vdst_in operand is already covered for these DPP instructions in
8317+
// cvtVOP3DPP.
8318+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) &&
8319+
!(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 ||
8320+
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 ||
8321+
Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 ||
8322+
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12)) {
83128323
assert(!IsPacked);
83138324
Inst.addOperand(Inst.getOperand(0));
83148325
}
@@ -8770,6 +8781,22 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
87708781
}
87718782
}
87728783

8784+
int VdstInIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
8785+
if (VdstInIdx == static_cast<int>(Inst.getNumOperands())) {
8786+
Inst.addOperand(Inst.getOperand(0));
8787+
}
8788+
8789+
bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
8790+
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 ||
8791+
Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
8792+
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12;
8793+
if (IsVOP3CvtSrDpp) {
8794+
if (Src2ModIdx == static_cast<int>(Inst.getNumOperands())) {
8795+
Inst.addOperand(MCOperand::createImm(0));
8796+
Inst.addOperand(MCOperand::createReg(0));
8797+
}
8798+
}
8799+
87738800
auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(),
87748801
MCOI::TIED_TO);
87758802
if (TiedTo != -1) {

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -712,6 +712,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
712712
AMDGPU::OpName::src2_modifiers);
713713
}
714714

715+
if (Res && (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp ||
716+
MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp)) {
717+
// Insert dummy unused src2_modifiers.
718+
insertNamedMCOperand(MI, MCOperand::createImm(0),
719+
AMDGPU::OpName::src2_modifiers);
720+
}
721+
715722
if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
716723
!AMDGPU::hasGDS(STI)) {
717724
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds);
@@ -942,6 +949,7 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const {
942949
// first add optional MI operands to check FI
943950
DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
944951
unsigned Opc = MI.getOpcode();
952+
945953
if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) {
946954
convertVOP3PDPPInst(MI);
947955
} else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) ||
@@ -951,6 +959,15 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
951959
if (isMacDPP(MI))
952960
convertMacDPPInst(MI);
953961

962+
int VDstInIdx =
963+
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
964+
if (VDstInIdx != -1)
965+
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
966+
967+
if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
968+
MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
969+
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
970+
954971
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
955972
if (MI.getNumOperands() < DescNumOps &&
956973
AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
@@ -977,6 +994,15 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
977994
if (isMacDPP(MI))
978995
convertMacDPPInst(MI);
979996

997+
int VDstInIdx =
998+
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
999+
if (VDstInIdx != -1)
1000+
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
1001+
1002+
if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
1003+
MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12)
1004+
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
1005+
9801006
unsigned Opc = MI.getOpcode();
9811007
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
9821008
if (MI.getNumOperands() < DescNumOps &&

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1305,6 +1305,16 @@ void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned,
13051305
const MCSubtargetInfo &STI,
13061306
raw_ostream &O) {
13071307
unsigned Opc = MI->getOpcode();
1308+
if (isCvt_F32_Fp8_Bf8_e64(Opc)) {
1309+
auto SrcMod =
1310+
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1311+
unsigned Mod = MI->getOperand(SrcMod).getImm();
1312+
unsigned Index0 = !!(Mod & SISrcMods::OP_SEL_0);
1313+
unsigned Index1 = !!(Mod & SISrcMods::OP_SEL_1);
1314+
if (Index0 || Index1)
1315+
O << " op_sel:[" << Index0 << ',' << Index1 << ']';
1316+
return;
1317+
}
13081318
if (isPermlane16(Opc)) {
13091319
auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
13101320
auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1684,8 +1684,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
16841684
!if(HasOMod,
16851685
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
16861686
clampmod0:$clamp, omod0:$omod),
1687-
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
1688-
clampmod0:$clamp))
1687+
!if (HasClamp,
1688+
(ins Src0Mod:$src0_modifiers, Src0RC:$src0, clampmod0:$clamp),
1689+
(ins Src0Mod:$src0_modifiers, Src0RC:$src0)))
16891690
/* else */,
16901691
// VOP1 without modifiers
16911692
!if (HasClamp,
@@ -2279,6 +2280,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
22792280
field bit IsSingle = 0;
22802281
field bit IsWMMA = 0;
22812282

2283+
field bit IsFP8 = 0;
2284+
22822285
field bit HasDst = !ne(DstVT.Value, untyped.Value);
22832286
field bit HasDst32 = HasDst;
22842287
field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -529,6 +529,17 @@ bool isPermlane16(unsigned Opc) {
529529
Opc == AMDGPU::V_PERMLANEX16_VAR_B32_e64_gfx12;
530530
}
531531

532+
bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) {
533+
return Opc == AMDGPU::V_CVT_F32_BF8_e64_gfx12 ||
534+
Opc == AMDGPU::V_CVT_F32_FP8_e64_gfx12 ||
535+
Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp_gfx12 ||
536+
Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp_gfx12 ||
537+
Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp8_gfx12 ||
538+
Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp8_gfx12 ||
539+
Opc == AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 ||
540+
Opc == AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12;
541+
}
542+
532543
bool isGenericAtomic(unsigned Opc) {
533544
return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN ||
534545
Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX ||

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -535,6 +535,9 @@ bool isPermlane16(unsigned Opc);
535535
LLVM_READNONE
536536
bool isGenericAtomic(unsigned Opc);
537537

538+
LLVM_READNONE
539+
bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc);
540+
538541
namespace VOPD {
539542

540543
enum Component : unsigned {

0 commit comments

Comments
 (0)