Skip to content

Commit c1764a7

Browse files
authored
AMDGPU: Add bf16 vectors to register class definitions (#76214)
Assorted intrinsics currently use i16 in place of a proper bfloat type, but they should really switch to bfloat. Note that this only changes the type lists in TableGen; the bf16 types are still not registered to be treated as truly legal types yet. Depends on #76213.
1 parent 2eb0ac0 commit c1764a7

File tree

3 files changed

+52
-42
lines changed

3 files changed

+52
-42
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1487,8 +1487,18 @@ foreach Index = 0-31 in {
14871487
// 16-bit bitcast
14881488
def : BitConvert <i16, f16, VGPR_32>;
14891489
def : BitConvert <f16, i16, VGPR_32>;
1490+
def : BitConvert <f16, bf16, VGPR_32>;
1491+
def : BitConvert <bf16, f16, VGPR_32>;
1492+
14901493
def : BitConvert <i16, f16, SReg_32>;
14911494
def : BitConvert <f16, i16, SReg_32>;
1495+
def : BitConvert <f16, bf16, SReg_32>;
1496+
def : BitConvert <bf16, f16, SReg_32>;
1497+
1498+
def : BitConvert <i16, bf16, VGPR_32>;
1499+
def : BitConvert <bf16, i16, VGPR_32>;
1500+
def : BitConvert <i16, bf16, SReg_32>;
1501+
def : BitConvert <bf16, i16, SReg_32>;
14921502

14931503
// 32-bit bitcast
14941504
def : BitConvert <i32, f32, VGPR_32>;

llvm/lib/Target/AMDGPU/SIRegisterInfo.td

Lines changed: 31 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,7 @@ def M0_CLASS : SIRegisterClass<"AMDGPU", [i32], 32, (add M0)> {
376376
let HasSGPR = 1;
377377
}
378378

379-
def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> {
379+
def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, (add M0_LO16)> {
380380
let CopyCost = 1;
381381
let Size = 16;
382382
let isAllocatable = 0;
@@ -385,15 +385,15 @@ def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> {
385385

386386
// TODO: Do we need to set DwarfRegAlias on register tuples?
387387

388-
def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
388+
def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
389389
(add (sequence "SGPR%u_LO16", 0, 105))> {
390390
let AllocationPriority = 0;
391391
let Size = 16;
392392
let GeneratePressureSet = 0;
393393
let HasSGPR = 1;
394394
}
395395

396-
def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
396+
def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
397397
(add (sequence "SGPR%u_HI16", 0, 105))> {
398398
let isAllocatable = 0;
399399
let Size = 16;
@@ -402,7 +402,7 @@ def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
402402
}
403403

404404
// SGPR 32-bit registers
405-
def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
405+
def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
406406
(add (sequence "SGPR%u", 0, 105))> {
407407
// Give all SGPR classes higher priority than VGPR classes, because
408408
// we want to spill SGPRs to VGPRs.
@@ -451,14 +451,14 @@ def SGPR_512Regs : SIRegisterTuples<getSubRegs<16>.ret, SGPR_32, 105, 4, 16, "s"
451451
def SGPR_1024Regs : SIRegisterTuples<getSubRegs<32>.ret, SGPR_32, 105, 4, 32, "s">;
452452

453453
// Trap handler TMP 32-bit registers
454-
def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
454+
def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16, v2bf16], 32,
455455
(add (sequence "TTMP%u", 0, 15))> {
456456
let isAllocatable = 0;
457457
let HasSGPR = 1;
458458
}
459459

460460
// Trap handler TMP 16-bit registers
461-
def TTMP_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
461+
def TTMP_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
462462
(add (sequence "TTMP%u_LO16", 0, 15))> {
463463
let Size = 16;
464464
let isAllocatable = 0;
@@ -584,8 +584,8 @@ class RegisterTypes<list<ValueType> reg_types> {
584584
list<ValueType> types = reg_types;
585585
}
586586

587-
def Reg16Types : RegisterTypes<[i16, f16]>;
588-
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>;
587+
def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
588+
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>;
589589

590590
let HasVGPR = 1 in {
591591
// VOP3 and VINTERP can access 256 lo and 256 hi registers.
@@ -683,7 +683,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
683683
}
684684

685685
// AccVGPR 32-bit registers
686-
def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
686+
def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
687687
(add (sequence "AGPR%u", 0, 255))> {
688688
let AllocationPriority = 0;
689689
let Size = 32;
@@ -735,15 +735,15 @@ def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">;
735735
// Register classes used as source and destination
736736
//===----------------------------------------------------------------------===//
737737

738-
def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
738+
def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
739739
(add FP_REG, SP_REG)> {
740740
let isAllocatable = 0;
741741
let CopyCost = -1;
742742
let HasSGPR = 1;
743743
let BaseClassOrder = 10000;
744744
}
745745

746-
def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16], 32,
746+
def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16, v8bf16], 32,
747747
(add PRIVATE_RSRC_REG)> {
748748
let isAllocatable = 0;
749749
let CopyCost = -1;
@@ -760,7 +760,7 @@ def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32,
760760
let GeneratePressureSet = 0, HasSGPR = 1 in {
761761
// Subset of SReg_32 without M0 for SMRD instructions and alike.
762762
// See comments in SIInstructions.td for more info.
763-
def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
763+
def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
764764
(add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
765765
SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE_LO,
766766
SRC_SHARED_LIMIT_LO, SRC_PRIVATE_BASE_LO, SRC_PRIVATE_LIMIT_LO, SRC_SHARED_BASE_HI,
@@ -769,7 +769,7 @@ def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2
769769
let AllocationPriority = 0;
770770
}
771771

772-
def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
772+
def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
773773
(add SGPR_LO16, VCC_LO_LO16, VCC_HI_LO16, FLAT_SCR_LO_LO16, FLAT_SCR_HI_LO16,
774774
XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, SGPR_NULL_HI_LO16, TTMP_LO16,
775775
TMA_LO_LO16, TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO_LO16,
@@ -782,39 +782,39 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
782782
let BaseClassOrder = 16;
783783
}
784784

785-
def SReg_32_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
785+
def SReg_32_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
786786
(add SReg_32_XM0_XEXEC, M0_CLASS)> {
787787
let AllocationPriority = 0;
788788
}
789789

790-
def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
790+
def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
791791
(add SReg_32_XEXEC, EXEC_LO)> {
792792
let AllocationPriority = 0;
793793
}
794794

795-
def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
795+
def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
796796
(add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
797797
let AllocationPriority = 0;
798798
}
799799

800800
} // End GeneratePressureSet = 0
801801

802802
// Register class for all scalar registers (SGPRs + Special Registers)
803-
def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
803+
def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
804804
(add SReg_32_XM0, M0_CLASS)> {
805805
let AllocationPriority = 0;
806806
let HasSGPR = 1;
807807
let BaseClassOrder = 32;
808808
}
809809

810810
let GeneratePressureSet = 0 in {
811-
def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
811+
def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
812812
(add SReg_32, LDS_DIRECT_CLASS)> {
813813
let isAllocatable = 0;
814814
let HasSGPR = 1;
815815
}
816816

817-
def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32,
817+
def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16, v4bf16], 32,
818818
(add SGPR_64Regs)> {
819819
let CopyCost = 1;
820820
let AllocationPriority = 1;
@@ -836,21 +836,21 @@ def Gfx_CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
836836
let HasSGPR = 1;
837837
}
838838

839-
def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
839+
def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16, v4bf16], 32,
840840
(add TTMP_64Regs)> {
841841
let isAllocatable = 0;
842842
let HasSGPR = 1;
843843
}
844844

845-
def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
845+
def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32,
846846
(add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SGPR_NULL64, SRC_SHARED_BASE,
847847
SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA)> {
848848
let CopyCost = 1;
849849
let AllocationPriority = 1;
850850
let HasSGPR = 1;
851851
}
852852

853-
def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
853+
def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32,
854854
(add SReg_64_XEXEC, EXEC)> {
855855
let CopyCost = 1;
856856
let AllocationPriority = 1;
@@ -905,11 +905,11 @@ multiclass SRegClass<int numRegs,
905905
}
906906

907907
defm "" : SRegClass<3, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
908-
defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>;
908+
defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], SGPR_128Regs, TTMP_128Regs>;
909909
defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
910910
defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
911911
defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
912-
defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>;
912+
defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], SGPR_256Regs, TTMP_256Regs>;
913913
defm "" : SRegClass<9, [v9i32, v9f32], SGPR_288Regs, TTMP_288Regs>;
914914
defm "" : SRegClass<10, [v10i32, v10f32], SGPR_320Regs, TTMP_320Regs>;
915915
defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>;
@@ -920,7 +920,7 @@ defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], SGPR_512
920920
defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
921921
}
922922

923-
def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
923+
def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
924924
(add VGPR_32, LDS_DIRECT_CLASS)> {
925925
let isAllocatable = 0;
926926
let HasVGPR = 1;
@@ -955,15 +955,15 @@ multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
955955
}
956956
}
957957

958-
defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],
958+
defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4bf16, v4i16, p0, p1, p4],
959959
(add VGPR_64)>;
960960
defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
961-
defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add VGPR_128)>;
961+
defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], (add VGPR_128)>;
962962
defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
963963

964964
defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
965965
defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>;
966-
defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], (add VGPR_256)>;
966+
defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], (add VGPR_256)>;
967967
defm VReg_288 : VRegClass<9, [v9i32, v9f32], (add VGPR_288)>;
968968
defm VReg_320 : VRegClass<10, [v10i32, v10f32], (add VGPR_320)>;
969969
defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>;
@@ -993,7 +993,7 @@ multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
993993
defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
994994
(add AGPR_64)>;
995995
defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
996-
defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add AGPR_128)>;
996+
defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], (add AGPR_128)>;
997997
defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
998998
defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>;
999999
defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>;
@@ -1032,14 +1032,14 @@ def VS_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
10321032
let HasVGPR = 1;
10331033
}
10341034

1035-
def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
1035+
def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
10361036
(add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> {
10371037
let isAllocatable = 0;
10381038
let HasVGPR = 1;
10391039
let HasSGPR = 1;
10401040
}
10411041

1042-
def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
1042+
def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
10431043
(add VGPR_32_Lo128, SReg_32, LDS_DIRECT_CLASS)> {
10441044
let isAllocatable = 0;
10451045
let HasVGPR = 1;

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -190,9 +190,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
190190
// because dealing with the write to high half of the register is
191191
// difficult.
192192
def : GCNPat <
193-
(build_vector f16:$elt0, (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
194-
(f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
195-
(f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
193+
(build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
194+
(f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
195+
(f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
196196
(v2f16 (mixhi_inst $src0_modifiers, $src0,
197197
$src1_modifiers, $src1,
198198
$src2_modifiers, $src2,
@@ -203,9 +203,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
203203
def : GCNPat <
204204
(build_vector
205205
f16:$elt0,
206-
(AMDGPUclamp (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
206+
(AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
207207
(f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
208-
(f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
208+
(f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
209209
(v2f16 (mixhi_inst $src0_modifiers, $src0,
210210
$src1_modifiers, $src1,
211211
$src2_modifiers, $src2,
@@ -215,12 +215,12 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
215215

216216
def : GCNPat <
217217
(AMDGPUclamp (build_vector
218-
(fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
218+
(f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
219219
(f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
220-
(f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
221-
(fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
220+
(f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))),
221+
(f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
222222
(f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
223-
(f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
223+
(f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))),
224224
(v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
225225
$hi_src1_modifiers, $hi_src1,
226226
$hi_src2_modifiers, $hi_src2,
@@ -243,8 +243,8 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
243243
>;
244244

245245
def : GCNPat <
246-
(build_vector f16:$elt0, (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
247-
(f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
246+
(build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
247+
(f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))),
248248
(v2f16 (mixhi_inst $src0_modifiers, $src0,
249249
$src1_modifiers, $src1,
250250
(i32 0), (i32 0),

0 commit comments

Comments
 (0)