@@ -376,7 +376,7 @@ def M0_CLASS : SIRegisterClass<"AMDGPU", [i32], 32, (add M0)> {
376
376
let HasSGPR = 1;
377
377
}
378
378
379
- def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> {
379
+ def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16 ], 16, (add M0_LO16)> {
380
380
let CopyCost = 1;
381
381
let Size = 16;
382
382
let isAllocatable = 0;
@@ -385,15 +385,15 @@ def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> {
385
385
386
386
// TODO: Do we need to set DwarfRegAlias on register tuples?
387
387
388
- def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
388
+ // 16-bit register class built from the SGPR%u_LO16 sequence (indices 0 to 105).
+ def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
389
389
(add (sequence "SGPR%u_LO16", 0, 105))> {
390
390
let AllocationPriority = 0;
391
391
let Size = 16;
392
392
let GeneratePressureSet = 0;
393
393
let HasSGPR = 1;
394
394
}
395
395
396
- def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
396
+ def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16 ], 16,
397
397
(add (sequence "SGPR%u_HI16", 0, 105))> {
398
398
let isAllocatable = 0;
399
399
let Size = 16;
@@ -402,7 +402,7 @@ def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
402
402
}
403
403
404
404
// SGPR 32-bit registers
405
- def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
405
+ def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16 ], 32,
406
406
(add (sequence "SGPR%u", 0, 105))> {
407
407
// Give all SGPR classes higher priority than VGPR classes, because
408
408
// we want to spill SGPRs to VGPRs.
@@ -451,14 +451,14 @@ def SGPR_512Regs : SIRegisterTuples<getSubRegs<16>.ret, SGPR_32, 105, 4, 16, "s"
451
451
def SGPR_1024Regs : SIRegisterTuples<getSubRegs<32>.ret, SGPR_32, 105, 4, 32, "s">;
452
452
453
453
// Trap handler TMP 32-bit registers
454
- def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
454
+ // NOTE(review): unlike SGPR_32, this list carries no scalar 16-bit types
+ // (i16, f16, bf16), only the packed v2* forms - confirm that is intentional.
+ def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16, v2bf16], 32,
455
455
(add (sequence "TTMP%u", 0, 15))> {
456
456
let isAllocatable = 0;
457
457
let HasSGPR = 1;
458
458
}
459
459
460
460
// Trap handler TMP 16-bit registers
461
- def TTMP_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
461
+ def TTMP_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16 ], 16,
462
462
(add (sequence "TTMP%u_LO16", 0, 15))> {
463
463
let Size = 16;
464
464
let isAllocatable = 0;
@@ -584,8 +584,8 @@ class RegisterTypes<list<ValueType> reg_types> {
584
584
list<ValueType> types = reg_types;
585
585
}
586
586
587
- def Reg16Types : RegisterTypes<[i16, f16]>;
588
- def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>;
587
+ // Shared value-type lists; register classes reference them as
+ // Reg16Types.types / Reg32Types.types instead of repeating the list.
+ def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
588
+ def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>;
589
589
590
590
let HasVGPR = 1 in {
591
591
// VOP3 and VINTERP can access 256 lo and 256 hi registers.
@@ -683,7 +683,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
683
683
}
684
684
685
685
// AccVGPR 32-bit registers
686
- def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
686
+ def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16 ], 32,
687
687
(add (sequence "AGPR%u", 0, 255))> {
688
688
let AllocationPriority = 0;
689
689
let Size = 32;
@@ -735,15 +735,15 @@ def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">;
735
735
// Register classes used as source and destination
736
736
//===----------------------------------------------------------------------===//
737
737
738
- def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
738
+ // Non-allocatable class containing only FP_REG and SP_REG.
+ def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
739
739
(add FP_REG, SP_REG)> {
740
740
let isAllocatable = 0;
741
741
let CopyCost = -1;
742
742
let HasSGPR = 1;
743
743
let BaseClassOrder = 10000;
744
744
}
745
745
746
- def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16], 32,
746
+ def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16, v8bf16 ], 32,
747
747
(add PRIVATE_RSRC_REG)> {
748
748
let isAllocatable = 0;
749
749
let CopyCost = -1;
@@ -760,7 +760,7 @@ def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32,
760
760
let GeneratePressureSet = 0, HasSGPR = 1 in {
761
761
// Subset of SReg_32 without M0 for SMRD instructions and the like.
762
762
// See comments in SIInstructions.td for more info.
763
- def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
763
+ def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16 , i1], 32,
764
764
(add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
765
765
SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE_LO,
766
766
SRC_SHARED_LIMIT_LO, SRC_PRIVATE_BASE_LO, SRC_PRIVATE_LIMIT_LO, SRC_SHARED_BASE_HI,
@@ -769,7 +769,7 @@ def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2
769
769
let AllocationPriority = 0;
770
770
}
771
771
772
- def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
772
+ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16 ], 16,
773
773
(add SGPR_LO16, VCC_LO_LO16, VCC_HI_LO16, FLAT_SCR_LO_LO16, FLAT_SCR_HI_LO16,
774
774
XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, SGPR_NULL_HI_LO16, TTMP_LO16,
775
775
TMA_LO_LO16, TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO_LO16,
@@ -782,39 +782,39 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
782
782
let BaseClassOrder = 16;
783
783
}
784
784
785
- def SReg_32_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
785
+ // SReg_32_XM0_XEXEC plus M0_CLASS.
+ def SReg_32_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
786
786
(add SReg_32_XM0_XEXEC, M0_CLASS)> {
787
787
let AllocationPriority = 0;
788
788
}
789
789
790
- def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
790
+ // SReg_32_XEXEC plus EXEC_LO.
+ def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
791
791
(add SReg_32_XEXEC, EXEC_LO)> {
792
792
let AllocationPriority = 0;
793
793
}
794
794
795
- def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
795
+ // SReg_32_XM0_XEXEC plus EXEC_LO and EXEC_HI.
+ def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
796
796
(add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
797
797
let AllocationPriority = 0;
798
798
}
799
799
800
800
} // End GeneratePressureSet = 0
801
801
802
802
// Register class for all scalar registers (SGPRs + Special Registers)
803
- def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
803
+ // Built as SReg_32_XM0 plus M0_CLASS.
+ def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
804
804
(add SReg_32_XM0, M0_CLASS)> {
805
805
let AllocationPriority = 0;
806
806
let HasSGPR = 1;
807
807
let BaseClassOrder = 32;
808
808
}
809
809
810
810
let GeneratePressureSet = 0 in {
811
- def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
811
+ // Non-allocatable union of SReg_32 and LDS_DIRECT_CLASS.
+ def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
812
812
(add SReg_32, LDS_DIRECT_CLASS)> {
813
813
let isAllocatable = 0;
814
814
let HasSGPR = 1;
815
815
}
816
816
817
- def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32,
817
+ def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16, v4bf16 ], 32,
818
818
(add SGPR_64Regs)> {
819
819
let CopyCost = 1;
820
820
let AllocationPriority = 1;
@@ -836,21 +836,21 @@ def Gfx_CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
836
836
let HasSGPR = 1;
837
837
}
838
838
839
- def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
839
+ // Non-allocatable class over TTMP_64Regs.
+ // NOTE(review): v2f32 is listed by SGPR_64 and SReg_64_XEXEC but not here -
+ // confirm whether the omission is intentional.
+ def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16, v4bf16], 32,
840
840
(add TTMP_64Regs)> {
841
841
let isAllocatable = 0;
842
842
let HasSGPR = 1;
843
843
}
844
844
845
- def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
845
+ // SGPR_64 plus the listed 64-bit special registers; EXEC is not a member
+ // (SReg_64 adds it on top of this class).
+ def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32,
846
846
(add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SGPR_NULL64, SRC_SHARED_BASE,
847
847
SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA)> {
848
848
let CopyCost = 1;
849
849
let AllocationPriority = 1;
850
850
let HasSGPR = 1;
851
851
}
852
852
853
- def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
853
+ def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16 ], 32,
854
854
(add SReg_64_XEXEC, EXEC)> {
855
855
let CopyCost = 1;
856
856
let AllocationPriority = 1;
@@ -905,11 +905,11 @@ multiclass SRegClass<int numRegs,
905
905
}
906
906
907
907
defm "" : SRegClass<3, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
908
- defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>;
908
+ defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], SGPR_128Regs, TTMP_128Regs>;
909
909
defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
910
910
defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
911
911
defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
912
- defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>;
912
+ defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], SGPR_256Regs, TTMP_256Regs>;
913
913
defm "" : SRegClass<9, [v9i32, v9f32], SGPR_288Regs, TTMP_288Regs>;
914
914
defm "" : SRegClass<10, [v10i32, v10f32], SGPR_320Regs, TTMP_320Regs>;
915
915
defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>;
@@ -920,7 +920,7 @@ defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], SGPR_512
920
920
defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
921
921
}
922
922
923
- def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
923
+ def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16 ], 32,
924
924
(add VGPR_32, LDS_DIRECT_CLASS)> {
925
925
let isAllocatable = 0;
926
926
let HasVGPR = 1;
@@ -955,15 +955,15 @@ multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
955
955
}
956
956
}
957
957
958
- defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],
958
+ // VRegClass instantiation with numRegs = 2 over VGPR_64; the type list
+ // carries all three 4 x 16-bit vector types (v4f16, v4bf16, v4i16).
+ defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4bf16, v4i16, p0, p1, p4],
959
959
(add VGPR_64)>;
960
960
defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
961
- defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add VGPR_128)>;
961
+ defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], (add VGPR_128)>;
962
962
defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
963
963
964
964
defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
965
965
defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>;
966
- defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], (add VGPR_256)>;
966
+ defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], (add VGPR_256)>;
967
967
defm VReg_288 : VRegClass<9, [v9i32, v9f32], (add VGPR_288)>;
968
968
defm VReg_320 : VRegClass<10, [v10i32, v10f32], (add VGPR_320)>;
969
969
defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>;
@@ -993,7 +993,7 @@ multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
993
993
// NOTE(review): VReg_64 lists v4bf16 and AReg_128 lists v8bf16, but this
// list has no v4bf16 - confirm whether the omission is intentional.
defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
994
994
(add AGPR_64)>;
995
995
defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
996
- defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add AGPR_128)>;
996
+ defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], (add AGPR_128)>;
997
997
defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
998
998
defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>;
999
999
defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>;
@@ -1032,14 +1032,14 @@ def VS_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
1032
1032
let HasVGPR = 1;
1033
1033
}
1034
1034
1035
- def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
1035
+ // Non-allocatable union of VGPR_32, SReg_32 and LDS_DIRECT_CLASS.
+ def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
1036
1036
(add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> {
1037
1037
let isAllocatable = 0;
1038
1038
let HasVGPR = 1;
1039
1039
let HasSGPR = 1;
1040
1040
}
1041
1041
1042
- def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
1042
+ def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16 ], 32,
1043
1043
(add VGPR_32_Lo128, SReg_32, LDS_DIRECT_CLASS)> {
1044
1044
let isAllocatable = 0;
1045
1045
let HasVGPR = 1;
0 commit comments