Skip to content

Commit ef4b597

Browse files
jthackray, momchil-velikov, Lukacma
authored
[AArch64] Add intrinsics for SME FP8 FDOT single and multi instructions (#119845)
Add support for the following SME 8-bit floating-point dot-product intrinsics:

```
// Only if __ARM_FEATURE_SME_F8F16 != 0
void svdot[_single]_za16[_mf8]_vg1x2_fpm(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za");
void svdot[_single]_za16[_mf8]_vg1x4_fpm(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za");
void svdot_za16[_mf8]_vg1x2_fpm(uint32_t slice, svmfloat8x2_t zn, svmfloat8x2_t zm, fpm_t fpm) __arm_streaming __arm_inout("za");
void svdot_za16[_mf8]_vg1x4_fpm(uint32_t slice, svmfloat8x4_t zn, svmfloat8x4_t zm, fpm_t fpm) __arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_F8F32 != 0
void svdot[_single]_za32[_mf8]_vg1x2_fpm(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za");
void svdot[_single]_za32[_mf8]_vg1x4_fpm(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za");
void svdot_za32[_mf8]_vg1x2_fpm(uint32_t slice, svmfloat8x2_t zn, svmfloat8x2_t zm, fpm_t fpm) __arm_streaming __arm_inout("za");
void svdot_za32[_mf8]_vg1x4_fpm(uint32_t slice, svmfloat8x4_t zn, svmfloat8x4_t zm, fpm_t fpm) __arm_streaming __arm_inout("za");
```

These intrinsics are extracted from: ARM-software/acle#323

Co-authored-by: Momchil Velikov <[email protected]>
Co-authored-by: Marian Lukac <[email protected]>
1 parent 7d25bce commit ef4b597

File tree

7 files changed

+415
-16
lines changed

7 files changed

+415
-16
lines changed

clang/include/clang/Basic/arm_sme.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -748,11 +748,23 @@ let SMETargetGuard = "sme2" in {
748748
// FP8 dot-product builtins whose names contain `za32`. Every definition in
// this block is guarded by the "sme-f8f32" target feature.
// Three operand shapes are declared, each for vg1x2 and vg1x4:
//   * svdot_lane_...     - prototype ends in `di>` and carries an immediate
//                          check (ImmCheck0_3) on argument index 3;
//   * svdot[_single]_... - prototype ends in `d>`, no immediate check;
//   * svdot_...          - prototype repeats the tuple digit (`22>` / `44>`),
//                          no immediate check.
// All of them share the flags IsStreaming, IsInOutZA, SetsFPMR and
// IsOverloadNone, and each maps to the like-named aarch64_sme_fp8_fdot_*
// LLVM intrinsic.
let SMETargetGuard = "sme-f8f32" in {
749749
def SVDOT_LANE_FP8_ZA32_VG1x2 : Inst<"svdot_lane_za32[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za32_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>;
750750
def SVDOT_LANE_FP8_ZA32_VG1x4 : Inst<"svdot_lane_za32[_mf8]_vg1x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za32_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>;
751+
752+
def SVDOT_SINGLE_FP8_ZA32_VG1x2 : Inst<"svdot[_single]_za32[_mf8]_vg1x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za32_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
753+
def SVDOT_SINGLE_FP8_ZA32_VG1x4 : Inst<"svdot[_single]_za32[_mf8]_vg1x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za32_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
754+
755+
def SVDOT_MULTI_FP8_ZA32_VG1x2 : Inst<"svdot_za32[_mf8]_vg1x2_fpm", "vm22>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za32_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
756+
def SVDOT_MULTI_FP8_ZA32_VG1x4 : Inst<"svdot_za32[_mf8]_vg1x4_fpm", "vm44>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za32_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
751757
}
752758

753759
// FP8 dot-product builtins whose names contain `za16`. Every definition in
// this block is guarded by the "sme-f8f16" target feature.
// Three operand shapes are declared, each for vg1x2 and vg1x4:
//   * svdot_lane_...     - prototype ends in `di>` and carries an immediate
//                          check (ImmCheck0_7) on argument index 3;
//   * svdot[_single]_... - prototype ends in `d>`, no immediate check;
//   * svdot_...          - prototype repeats the tuple digit (`22>` / `44>`),
//                          no immediate check.
// All of them share the flags IsStreaming, IsInOutZA, SetsFPMR and
// IsOverloadNone, and each maps to the like-named aarch64_sme_fp8_fdot_*
// LLVM intrinsic.
let SMETargetGuard = "sme-f8f16" in {
754760
def SVDOT_LANE_FP8_ZA16_VG1x2 : Inst<"svdot_lane_za16[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
755761
def SVDOT_LANE_FP8_ZA16_VG1x4 : Inst<"svdot_lane_za16[_mf8]_vg1x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
762+
763+
def SVDOT_SINGLE_FP8_ZA16_VG1x2 : Inst<"svdot[_single]_za16[_mf8]_vg1x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
764+
def SVDOT_SINGLE_FP8_ZA16_VG1x4 : Inst<"svdot[_single]_za16[_mf8]_vg1x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
765+
766+
def SVDOT_MULTI_FP8_ZA16_VG1x2 : Inst<"svdot_za16[_mf8]_vg1x2_fpm", "vm22>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
767+
def SVDOT_MULTI_FP8_ZA16_VG1x4 : Inst<"svdot_za16[_mf8]_vg1x4_fpm", "vm44>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
756768
}
757769

758770
////////////////////////////////////////////////////////////////////////////////

clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c

Lines changed: 167 additions & 7 deletions
Large diffs are not rendered by default.

clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fp8_fdot.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,22 @@ void test_features(uint32_t slice, svmfloat8_t f8, svmfloat8x2_t f8x2,
1414
svdot_lane_za16_mf8_vg1x2_fpm(slice, f8x2, f8, 3, fpmr);
1515
// expected-error@+1 {{'svdot_lane_za16_mf8_vg1x4_fpm' needs target feature sme,sme-f8f16}}
1616
svdot_lane_za16_mf8_vg1x4_fpm(slice, f8x4, f8, 3, fpmr);
17+
// expected-error@+1 {{'svdot_single_za32_mf8_vg1x2_fpm' needs target feature sme,sme-f8f32}}
18+
svdot_single_za32_mf8_vg1x2_fpm(slice, f8x2, f8, fpmr);
19+
// expected-error@+1 {{'svdot_single_za32_mf8_vg1x4_fpm' needs target feature sme,sme-f8f32}}
20+
svdot_single_za32_mf8_vg1x4_fpm(slice, f8x4, f8, fpmr);
21+
// expected-error@+1 {{'svdot_za32_mf8_vg1x2_fpm' needs target feature sme,sme-f8f32}}
22+
svdot_za32_mf8_vg1x2_fpm(slice, f8x2, f8x2, fpmr);
23+
// expected-error@+1 {{'svdot_za32_mf8_vg1x4_fpm' needs target feature sme,sme-f8f32}}
24+
svdot_za32_mf8_vg1x4_fpm(slice, f8x4, f8x4, fpmr);
25+
// expected-error@+1 {{'svdot_single_za16_mf8_vg1x2_fpm' needs target feature sme,sme-f8f16}}
26+
svdot_single_za16_mf8_vg1x2_fpm(slice, f8x2, f8, fpmr);
27+
// expected-error@+1 {{'svdot_single_za16_mf8_vg1x4_fpm' needs target feature sme,sme-f8f16}}
28+
svdot_single_za16_mf8_vg1x4_fpm(slice, f8x4, f8, fpmr);
29+
// expected-error@+1 {{'svdot_za16_mf8_vg1x2_fpm' needs target feature sme,sme-f8f16}}
30+
svdot_za16_mf8_vg1x2_fpm(slice, f8x2, f8x2, fpmr);
31+
// expected-error@+1 {{'svdot_za16_mf8_vg1x4_fpm' needs target feature sme,sme-f8f16}}
32+
svdot_za16_mf8_vg1x4_fpm(slice, f8x4, f8x4, fpmr);
1733
}
1834

1935
void test_imm(uint32_t slice, svmfloat8_t f8, svmfloat8x2_t f8x2,

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3874,11 +3874,47 @@ class SME2_FP8_FDOT_LANE_VG1x4 :
38743874
llvm_i32_ty],
38753875
[IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<6>>]>;
38763876

3877+
// Signature shared by the FP8 FDOT "single" intrinsics for vg1x2.
// No results; operands are one i32 followed by three nxv16i8 vectors.
// NOTE(review): going by the test calls (i32 %slice, %zn1, %zn2, %zm), the
// i32 is the ZA slice index, the first two vectors form the Zn pair and the
// last is the single Zm vector - confirm against the ACLE prototype.
// There is no operand for the C-level fpm_t argument.
class SME2_FP8_FDOT_SINGLE_VG1x2 :
3878+
DefaultAttrsIntrinsic<[], [llvm_i32_ty,
3879+
llvm_nxv16i8_ty, llvm_nxv16i8_ty,
3880+
llvm_nxv16i8_ty],
3881+
[IntrInaccessibleMemOnly, IntrHasSideEffects]>;
3882+
3883+
// Signature shared by the FP8 FDOT "single" intrinsics for vg1x4.
// No results; operands are one i32 followed by five nxv16i8 vectors.
// NOTE(review): going by the test calls (i32 %slice, %zn1..%zn4, %zm), the
// i32 is the ZA slice index, the first four vectors form the Zn group and
// the last is the single Zm vector - confirm against the ACLE prototype.
// There is no operand for the C-level fpm_t argument.
class SME2_FP8_FDOT_SINGLE_VG1x4 :
3884+
DefaultAttrsIntrinsic<[], [llvm_i32_ty,
3885+
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
3886+
llvm_nxv16i8_ty],
3887+
[IntrInaccessibleMemOnly, IntrHasSideEffects]>;
3888+
3889+
// Signature shared by the FP8 FDOT "multi" intrinsics for vg1x2.
// No results; operands are one i32 followed by four nxv16i8 vectors.
// NOTE(review): going by the test calls (i32 %slice, %zn1, %zn2, %zm1,
// %zm2), the i32 is the ZA slice index, followed by the Zn pair and then
// the Zm pair - confirm against the ACLE prototype.
// There is no operand for the C-level fpm_t argument.
class SME2_FP8_FDOT_MULTI_VG1x2 :
3890+
DefaultAttrsIntrinsic<[], [llvm_i32_ty,
3891+
llvm_nxv16i8_ty, llvm_nxv16i8_ty,
3892+
llvm_nxv16i8_ty, llvm_nxv16i8_ty],
3893+
[IntrInaccessibleMemOnly, IntrHasSideEffects]>;
3894+
3895+
// Signature shared by the FP8 FDOT "multi" intrinsics for vg1x4.
// No results; operands are one i32 followed by eight nxv16i8 vectors.
// NOTE(review): going by the test calls (i32 %slice, %zn1..%zn4,
// %zm1..%zm4), the i32 is the ZA slice index, followed by the Zn group of
// four and then the Zm group of four - confirm against the ACLE prototype.
// There is no operand for the C-level fpm_t argument.
class SME2_FP8_FDOT_MULTI_VG1x4 :
3896+
DefaultAttrsIntrinsic<[], [llvm_i32_ty,
3897+
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
3898+
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
3899+
[IntrInaccessibleMemOnly, IntrHasSideEffects]>;
3900+
38773901
// Concrete FP8 FDOT intrinsics. The za16 and za32 variants of each operand
// form (lane / single / multi) reuse the same signature class, so only the
// intrinsic name distinguishes them.
def int_aarch64_sme_fp8_fdot_lane_za16_vg1x2 : SME2_FP8_FDOT_LANE_VG1x2;
38783902
def int_aarch64_sme_fp8_fdot_lane_za16_vg1x4 : SME2_FP8_FDOT_LANE_VG1x4;
38793903

38803904
def int_aarch64_sme_fp8_fdot_lane_za32_vg1x2 : SME2_FP8_FDOT_LANE_VG1x2;
38813905
def int_aarch64_sme_fp8_fdot_lane_za32_vg1x4 : SME2_FP8_FDOT_LANE_VG1x4;
3906+
3907+
def int_aarch64_sme_fp8_fdot_single_za16_vg1x2 : SME2_FP8_FDOT_SINGLE_VG1x2;
3908+
def int_aarch64_sme_fp8_fdot_single_za16_vg1x4 : SME2_FP8_FDOT_SINGLE_VG1x4;
3909+
3910+
def int_aarch64_sme_fp8_fdot_single_za32_vg1x2 : SME2_FP8_FDOT_SINGLE_VG1x2;
3911+
def int_aarch64_sme_fp8_fdot_single_za32_vg1x4 : SME2_FP8_FDOT_SINGLE_VG1x4;
3912+
3913+
def int_aarch64_sme_fp8_fdot_multi_za16_vg1x2 : SME2_FP8_FDOT_MULTI_VG1x2;
3914+
def int_aarch64_sme_fp8_fdot_multi_za16_vg1x4 : SME2_FP8_FDOT_MULTI_VG1x4;
3915+
3916+
def int_aarch64_sme_fp8_fdot_multi_za32_vg1x2 : SME2_FP8_FDOT_MULTI_VG1x2;
3917+
def int_aarch64_sme_fp8_fdot_multi_za32_vg1x4 : SME2_FP8_FDOT_MULTI_VG1x4;
38823918
}
38833919

38843920
//

llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -988,11 +988,11 @@ let Predicates = [HasSMEF8F16] in {
988988
defm FVDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_f8f16<"fvdot", 0b11, 0b110, ZZ_b_mul_r, ZPR4b8>;
989989
defm FDOT_VG2_M2ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x2<"fdot", int_aarch64_sme_fp8_fdot_lane_za16_vg1x2>;
990990
defm FDOT_VG4_M4ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x4<"fdot", int_aarch64_sme_fp8_fdot_lane_za16_vg1x4>;
991-
defm FDOT_VG2_M2ZZ_BtoH : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010001, MatrixOp16, ZZ_b, ZPR4b8>;
992-
defm FDOT_VG4_M4ZZ_BtoH : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110001, MatrixOp16, ZZZZ_b, ZPR4b8>;
993991

994-
defm FDOT_VG2_M2Z2Z_BtoH : sme2_dot_mla_add_sub_array_vg2_multi<"fdot", 0b0100100, MatrixOp16, ZZ_b_mul_r, nxv16i8, null_frag>;
995-
defm FDOT_VG4_M4Z4Z_BtoH : sme2_dot_mla_add_sub_array_vg4_multi<"fdot", 0b0100100, MatrixOp16, ZZZZ_b_mul_r, nxv16i8, null_frag>;
992+
defm FDOT_VG2_M2ZZ_BtoH : sme2_fp8_fdot_single_vg1x2<"fdot", 0b0010001, MatrixOp16, int_aarch64_sme_fp8_fdot_single_za16_vg1x2>;
993+
defm FDOT_VG4_M4ZZ_BtoH : sme2_fp8_fdot_single_vg1x4<"fdot", 0b0110001, MatrixOp16, int_aarch64_sme_fp8_fdot_single_za16_vg1x4>;
994+
defm FDOT_VG2_M2Z2Z_BtoH : sme2_fp8_fdot_multi_vg1x2 <"fdot", 0b0100100, MatrixOp16, int_aarch64_sme_fp8_fdot_multi_za16_vg1x2>;
995+
defm FDOT_VG4_M4Z4Z_BtoH : sme2_fp8_fdot_multi_vg1x4 <"fdot", 0b0100100, MatrixOp16, int_aarch64_sme_fp8_fdot_multi_za16_vg1x4>;
996996

997997
def FMLAL_MZZI_BtoH : sme2_mla_ll_array_index_16b<"fmlal", 0b11, 0b00>;
998998
defm FMLAL_VG2_M2ZZI_BtoH : sme2_multi_vec_array_vg2_index_16b<"fmlal", 0b10, 0b111>;
@@ -1011,11 +1011,10 @@ let Predicates = [HasSMEF8F32] in {
10111011
defm FDOT_VG2_M2ZZI_BtoS : sme2_fp8_fdot_index_za32_vg1x2<"fdot", int_aarch64_sme_fp8_fdot_lane_za32_vg1x2>;
10121012
defm FDOT_VG4_M4ZZI_BtoS : sme2_fp8_fdot_index_za32_vg1x4<"fdot", int_aarch64_sme_fp8_fdot_lane_za32_vg1x4>;
10131013

1014-
defm FDOT_VG2_M2ZZ_BtoS : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010011, MatrixOp32, ZZ_b, ZPR4b8>;
1015-
defm FDOT_VG4_M4ZZ_BtoS : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110011, MatrixOp32, ZZZZ_b, ZPR4b8>;
1016-
1017-
defm FDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"fdot", 0b0100110, MatrixOp32, ZZ_b_mul_r, nxv16i8, null_frag>;
1018-
defm FDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"fdot", 0b0100110, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, null_frag>;
1014+
defm FDOT_VG2_M2ZZ_BtoS : sme2_fp8_fdot_single_vg1x2<"fdot", 0b0010011, MatrixOp32, int_aarch64_sme_fp8_fdot_single_za32_vg1x2>;
1015+
defm FDOT_VG4_M4ZZ_BtoS : sme2_fp8_fdot_single_vg1x4<"fdot", 0b0110011, MatrixOp32, int_aarch64_sme_fp8_fdot_single_za32_vg1x4>;
1016+
defm FDOT_VG2_M2Z2Z_BtoS : sme2_fp8_fdot_multi_vg1x2 <"fdot", 0b0100110, MatrixOp32, int_aarch64_sme_fp8_fdot_multi_za32_vg1x2>;
1017+
defm FDOT_VG4_M4Z4Z_BtoS : sme2_fp8_fdot_multi_vg1x4 <"fdot", 0b0100110, MatrixOp32, int_aarch64_sme_fp8_fdot_multi_za32_vg1x4>;
10191018

10201019
def FVDOTB_VG4_M2ZZI_BtoS : sme2_fp8_multi_vec_array_vg4_index<"fvdotb", 0b0>;
10211020
def FVDOTT_VG4_M2ZZI_BtoS : sme2_fp8_multi_vec_array_vg4_index<"fvdott", 0b1>;

llvm/lib/Target/AArch64/SMEInstrFormats.td

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5882,3 +5882,67 @@ multiclass sme2_fp8_fdot_index_za32_vg1x4<string mnemonic,
58825882

58835883
def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, ZPR4b8, nxv16i8, VectorIndexS32b_timm, tileslice16>;
58845884
}
5885+
5886+
// FP8 FDOT, two-vector group (ZZ_b) times a single vector (ZPR4b8).
// Parameters:
//   mnemonic  - assembly mnemonic, also used to build the alias string.
//   op        - 7-bit opcode field forwarded to the instruction class.
//   matrix_op - ZA matrix operand (callers pass MatrixOp16 or MatrixOp32).
//   intrinsic - intrinsic handed to the selection pattern.
// Emits four records:
//   1. NAME - the instruction itself, declared as using FPMR and FPCR.
//   2. An InstAlias for the operand syntax "$ZAd[$Rv, $imm3], $Zn, $Zm".
//      NOTE(review): the trailing 0 presumably keeps the alias parse-only
//      (not used when printing) - confirm.
//   3. NAME_PSEUDO - a ZA-array pseudo paired with NAME through
//      SMEPseudo2Instr.
//   4. A pattern tying `intrinsic` to NAME, with a 0-7 slice offset
//      (sme_elm_idx0_7) and the tileslice16 addressing pattern.
multiclass sme2_fp8_fdot_single_vg1x2<string mnemonic, bits<7> op,
5887+
MatrixOperand matrix_op,
5888+
SDPatternOperator intrinsic> {
5889+
def NAME: sme2_dot_mla_add_sub_array_vg24_single<op, matrix_op, ZZ_b, ZPR4b8, mnemonic>,
5890+
SMEPseudo2Instr<NAME, 1> {
5891+
let Uses=[FPMR, FPCR];
5892+
}
5893+
5894+
def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm",
5895+
(!cast<Instruction>(NAME) matrix_op:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, ZZ_b:$Zn, ZPR4b8:$Zm), 0>;
5896+
5897+
def _PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME, sme_elm_idx0_7, ZZ_b, ZPR4b8, SMEMatrixArray>;
5898+
5899+
def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat<NAME, intrinsic, sme_elm_idx0_7, ZPR4b8, nxv16i8, tileslice16>;
5900+
}
5901+
5902+
// FP8 FDOT, four-vector group (ZZZZ_b) times a single vector (ZPR4b8).
// Parameters:
//   mnemonic  - assembly mnemonic, also used to build the alias string.
//   op        - 7-bit opcode field forwarded to the instruction class.
//   matrix_op - ZA matrix operand (callers pass MatrixOp16 or MatrixOp32).
//   intrinsic - intrinsic handed to the selection pattern.
// Emits four records:
//   1. NAME - the instruction itself, declared as using FPMR and FPCR.
//   2. An InstAlias for the operand syntax "$ZAd[$Rv, $imm3], $Zn, $Zm".
//      NOTE(review): the trailing 0 presumably keeps the alias parse-only
//      (not used when printing) - confirm.
//   3. NAME_PSEUDO - a ZA-array pseudo paired with NAME through
//      SMEPseudo2Instr.
//   4. A pattern tying `intrinsic` to NAME, with a 0-7 slice offset
//      (sme_elm_idx0_7) and the tileslice16 addressing pattern.
multiclass sme2_fp8_fdot_single_vg1x4<string mnemonic, bits<7> op,
5903+
MatrixOperand matrix_op,
5904+
SDPatternOperator intrinsic> {
5905+
def NAME: sme2_dot_mla_add_sub_array_vg24_single<op, matrix_op, ZZZZ_b, ZPR4b8, mnemonic>,
5906+
SMEPseudo2Instr<NAME, 1> {
5907+
let Uses=[FPMR, FPCR];
5908+
}
5909+
5910+
def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm",
5911+
(!cast<Instruction>(NAME) matrix_op:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, ZZZZ_b:$Zn, ZPR4b8:$Zm), 0>;
5912+
5913+
def _PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME, sme_elm_idx0_7, ZZZZ_b, ZPR4b8, SMEMatrixArray>;
5914+
5915+
def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat<NAME, intrinsic, sme_elm_idx0_7, ZPR4b8, nxv16i8, tileslice16>;
5916+
}
5917+
5918+
// FP8 FDOT, two-vector group times two-vector group (both ZZ_b_mul_r).
// Parameters:
//   mnemonic  - assembly mnemonic, also used to build the alias string.
//   op        - 7-bit opcode field forwarded to the instruction class.
//   matrix_op - ZA matrix operand (callers pass MatrixOp16 or MatrixOp32).
//   intrinsic - intrinsic handed to the selection pattern.
// Emits four records:
//   1. NAME - the instruction itself, declared as using FPMR and FPCR.
//   2. An InstAlias for the operand syntax "$ZAd[$Rv, $imm3], $Zn, $Zm".
//      NOTE(review): the trailing 0 presumably keeps the alias parse-only
//      (not used when printing) - confirm.
//   3. NAME_PSEUDO - a ZA-array pseudo paired with NAME through
//      SMEPseudo2Instr.
//   4. A pattern tying `intrinsic` to NAME, with a 0-7 slice offset
//      (sme_elm_idx0_7) and the tileslice16 addressing pattern.
multiclass sme2_fp8_fdot_multi_vg1x2<string mnemonic, bits<7> op,
5919+
MatrixOperand matrix_op,
5920+
SDPatternOperator intrinsic> {
5921+
def NAME : sme2_dot_mla_add_sub_array_vg2_multi<op, matrix_op, ZZ_b_mul_r, mnemonic>,
5922+
SMEPseudo2Instr<NAME, 1> {
5923+
let Uses=[FPMR, FPCR];
5924+
}
5925+
5926+
def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm",
5927+
(!cast<Instruction>(NAME) matrix_op:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, ZZ_b_mul_r:$Zn, ZZ_b_mul_r:$Zm), 0>;
5928+
5929+
def _PSEUDO : sme2_za_array_2op_multi_multi_pseudo<NAME, sme_elm_idx0_7, ZZ_b_mul_r, SMEMatrixArray>;
5930+
5931+
def : SME2_ZA_TwoOp_VG2_Multi_Multi_Pat<NAME, intrinsic, sme_elm_idx0_7, nxv16i8, tileslice16>;
5932+
}
5933+
5934+
// FP8 FDOT, four-vector group times four-vector group (both ZZZZ_b_mul_r).
// Parameters:
//   mnemonic  - assembly mnemonic, also used to build the alias string.
//   op        - 7-bit opcode field forwarded to the instruction class.
//   matrix_op - ZA matrix operand (callers pass MatrixOp16 or MatrixOp32).
//   intrinsic - intrinsic handed to the selection pattern.
// Emits four records:
//   1. NAME - the instruction itself, declared as using FPMR and FPCR.
//   2. An InstAlias for the operand syntax "$ZAd[$Rv, $imm3], $Zn, $Zm".
//      NOTE(review): the trailing 0 presumably keeps the alias parse-only
//      (not used when printing) - confirm.
//   3. NAME_PSEUDO - a ZA-array pseudo paired with NAME through
//      SMEPseudo2Instr.
//   4. A pattern tying `intrinsic` to NAME, with a 0-7 slice offset
//      (sme_elm_idx0_7) and the tileslice16 addressing pattern.
multiclass sme2_fp8_fdot_multi_vg1x4<string mnemonic, bits<7> op,
5935+
MatrixOperand matrix_op,
5936+
SDPatternOperator intrinsic> {
5937+
def NAME : sme2_dot_mla_add_sub_array_vg4_multi<op, matrix_op, ZZZZ_b_mul_r, mnemonic>,
5938+
SMEPseudo2Instr<NAME, 1> {
5939+
let Uses=[FPMR, FPCR];
5940+
}
5941+
5942+
def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm",
5943+
(!cast<Instruction>(NAME) matrix_op:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, ZZZZ_b_mul_r:$Zn, ZZZZ_b_mul_r:$Zm), 0>;
5944+
5945+
def _PSEUDO : sme2_za_array_2op_multi_multi_pseudo<NAME, sme_elm_idx0_7, ZZZZ_b_mul_r, SMEMatrixArray>;
5946+
5947+
def : SME2_ZA_TwoOp_VG4_Multi_Multi_Pat<NAME, intrinsic, sme_elm_idx0_7, nxv16i8, tileslice16>;
5948+
}

llvm/test/CodeGen/AArch64/sme2-intrinsics-fp8-fdot.ll

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,4 +56,116 @@ define void @test_fdot32_1x4_indexed(i32 %slice.0,
5656
ret void
5757
}
5858

59+
; Single-vector FP8 FDOT into za.s, vgx2. The slice operand is %slice.0 + 7;
; the CHECK lines expect %slice.0 to be moved into w8 and the +7 to appear as
; the immediate slice offset of the fdot instruction.
define void @test_fdot32_1x2_single(i32 %slice.0,
60+
; CHECK-LABEL: test_fdot32_1x2_single:
61+
; CHECK: mov w8, w0
62+
; CHECK: fdot za.s[w8, 7, vgx2], { z0.b, z1.b }, z2.b
63+
; CHECK: ret
64+
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
65+
<vscale x 16 x i8> %zm) #0 {
66+
%slice = add i32 %slice.0, 7
67+
call void @llvm.aarch64.sme.fp8.fdot.single.za32.vg1x2(i32 %slice,
68+
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
69+
<vscale x 16 x i8> %zm)
70+
ret void
71+
}
72+
73+
; Single-vector FP8 FDOT into za.s, vgx4. The slice operand is %slice.0 + 7;
; the CHECK lines expect %slice.0 to be moved into w8 and the +7 to appear as
; the immediate slice offset of the fdot instruction.
define void @test_fdot32_1x4_single(i32 %slice.0,
74+
; CHECK-LABEL: test_fdot32_1x4_single:
75+
; CHECK: mov w8, w0
76+
; CHECK: fdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b
77+
; CHECK: ret
78+
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
79+
<vscale x 16 x i8> %zm) #0 {
80+
%slice = add i32 %slice.0, 7
81+
call void @llvm.aarch64.sme.fp8.fdot.single.za32.vg1x4(i32 %slice,
82+
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
83+
<vscale x 16 x i8> %zm)
84+
ret void
85+
}
86+
87+
; Multi-vector FP8 FDOT into za.s, vgx2. The slice operand is %slice.0 + 7;
; the CHECK lines expect %slice.0 to be moved into w8 and the +7 to appear as
; the immediate slice offset of the fdot instruction.
define void @test_fdot32_1x2_multi(i32 %slice.0,
88+
; CHECK-LABEL: test_fdot32_1x2_multi:
89+
; CHECK: mov w8, w0
90+
; CHECK: fdot za.s[w8, 7, vgx2], { z0.b, z1.b }, { z2.b, z3.b }
91+
; CHECK: ret
92+
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
93+
<vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) #0 {
94+
%slice = add i32 %slice.0, 7
95+
call void @llvm.aarch64.sme.fp8.fdot.multi.za32.vg1x2(i32 %slice,
96+
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
97+
<vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
98+
ret void
99+
}
100+
101+
; Multi-vector FP8 FDOT into za.s, vgx4. The slice operand is %slice.0 + 7;
; the CHECK lines expect %slice.0 to be moved into w8 and the +7 to appear as
; the immediate slice offset of the fdot instruction.
define void @test_fdot32_1x4_multi(i32 %slice.0,
102+
; CHECK-LABEL: test_fdot32_1x4_multi:
103+
; CHECK: mov w8, w0
104+
; CHECK: fdot za.s[w8, 7, vgx4], { z0.b - z3.b }, { z4.b - z7.b }
105+
; CHECK: ret
106+
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
107+
<vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) #0 {
108+
%slice = add i32 %slice.0, 7
109+
call void @llvm.aarch64.sme.fp8.fdot.multi.za32.vg1x4(i32 %slice,
110+
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
111+
<vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4)
112+
ret void
113+
}
114+
115+
; Single-vector FP8 FDOT into za.h, vgx2. The slice operand is %slice.0 + 7;
; the CHECK lines expect %slice.0 to be moved into w8 and the +7 to appear as
; the immediate slice offset of the fdot instruction.
define void @test_fdot16_1x2_single(i32 %slice.0,
116+
; CHECK-LABEL: test_fdot16_1x2_single:
117+
; CHECK: mov w8, w0
118+
; CHECK: fdot za.h[w8, 7, vgx2], { z0.b, z1.b }, z2.b
119+
; CHECK: ret
120+
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
121+
<vscale x 16 x i8> %zm) #0 {
122+
%slice = add i32 %slice.0, 7
123+
call void @llvm.aarch64.sme.fp8.fdot.single.za16.vg1x2(i32 %slice,
124+
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
125+
<vscale x 16 x i8> %zm)
126+
ret void
127+
}
128+
129+
; Single-vector FP8 FDOT into za.h, vgx4. The slice operand is %slice.0 + 7;
; the CHECK lines expect %slice.0 to be moved into w8 and the +7 to appear as
; the immediate slice offset of the fdot instruction.
define void @test_fdot16_1x4_single(i32 %slice.0,
130+
; CHECK-LABEL: test_fdot16_1x4_single:
131+
; CHECK: mov w8, w0
132+
; CHECK: fdot za.h[w8, 7, vgx4], { z0.b - z3.b }, z4.b
133+
; CHECK: ret
134+
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
135+
<vscale x 16 x i8> %zm) #0 {
136+
%slice = add i32 %slice.0, 7
137+
call void @llvm.aarch64.sme.fp8.fdot.single.za16.vg1x4(i32 %slice,
138+
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
139+
<vscale x 16 x i8> %zm)
140+
ret void
141+
}
142+
143+
; Multi-vector FP8 FDOT into za.h, vgx2. The slice operand is %slice.0 + 7;
; the CHECK lines expect %slice.0 to be moved into w8 and the +7 to appear as
; the immediate slice offset of the fdot instruction.
define void @test_fdot16_1x2_multi(i32 %slice.0,
144+
; CHECK-LABEL: test_fdot16_1x2_multi:
145+
; CHECK: mov w8, w0
146+
; CHECK: fdot za.h[w8, 7, vgx2], { z0.b, z1.b }, { z2.b, z3.b }
147+
; CHECK: ret
148+
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
149+
<vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) #0 {
150+
%slice = add i32 %slice.0, 7
151+
call void @llvm.aarch64.sme.fp8.fdot.multi.za16.vg1x2(i32 %slice,
152+
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
153+
<vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
154+
ret void
155+
}
156+
157+
; Multi-vector FP8 FDOT into za.h, vgx4. The slice operand is %slice.0 + 7;
; the CHECK lines expect %slice.0 to be moved into w8 and the +7 to appear as
; the immediate slice offset of the fdot instruction.
define void @test_fdot16_1x4_multi(i32 %slice.0,
158+
; CHECK-LABEL: test_fdot16_1x4_multi:
159+
; CHECK: mov w8, w0
160+
; CHECK: fdot za.h[w8, 7, vgx4], { z0.b - z3.b }, { z4.b - z7.b }
161+
; CHECK: ret
162+
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
163+
<vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) #0 {
164+
%slice = add i32 %slice.0, 7
165+
call void @llvm.aarch64.sme.fp8.fdot.multi.za16.vg1x4(i32 %slice,
166+
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
167+
<vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4)
168+
ret void
169+
}
170+
59171
attributes #0 = { "target-features" = "+sme,+sme-f8f32,+sme-f8f16" }

0 commit comments

Comments
 (0)