Skip to content

Commit 9c89b40

Browse files
authored
[AArch64] Implement intrinsics for FMLAL/FMLALL (single) (#119568)
Multi-vector 8-bit floating-point multiply-add long (single) ```c // Only if __ARM_FEATURE_SME_F8F16 != 0 void svmla[_single]_za16[_mf8]_vg2x1_fpm(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za"); void svmla[_single]_za16[_mf8]_vg2x2_fpm(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za"); void svmla[_single]_za16[_mf8]_vg2x4_fpm(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za"); // Only if __ARM_FEATURE_SME_F8F32 != 0 void svmla[_single]_za32[_mf8]_vg4x1_fpm(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za"); void svmla[_single]_za32[_mf8]_vg4x2_fpm(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za"); void svmla[_single]_za32[_mf8]_vg4x4_fpm(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za"); ``` In accordance with ARM-software/acle#323. Co-authored-by: Momchil Velikov [email protected]
1 parent 2a7ed2c commit 9c89b40

File tree

7 files changed

+360
-89
lines changed

7 files changed

+360
-89
lines changed

clang/include/clang/Basic/arm_sme.td

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -860,24 +860,38 @@ let SMETargetGuard = "sme-f8f32" in {
860860
def SVMOPA_FP8_ZA32 : Inst<"svmopa_za32[_mf8]_m_fpm", "viPPdd>", "m", MergeNone, "aarch64_sme_fp8_fmopa_za32",
861861
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<0, ImmCheck0_3>]>;
862862
// FMLALL (indexed)
863-
def SVMLA_FP8_ZA32_VG4x1 : Inst<"svmla_lane_za32[_mf8]_vg4x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x1",
864-
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
865-
def SVMLA_FP8_ZA32_VG4x2 : Inst<"svmla_lane_za32[_mf8]_vg4x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x2",
866-
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
867-
def SVMLA_FP8_ZA16_VG4x4 : Inst<"svmla_lane_za32[_mf8]_vg4x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x4",
868-
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
863+
def SVMLA_FP8_LANE_ZA32_VG4x1 : Inst<"svmla_lane_za32[_mf8]_vg4x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x1",
864+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
865+
def SVMLA_FP8_LANE_ZA32_VG4x2 : Inst<"svmla_lane_za32[_mf8]_vg4x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x2",
866+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
867+
def SVMLA_FP8_LANE_ZA16_VG4x4 : Inst<"svmla_lane_za32[_mf8]_vg4x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x4",
868+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
869+
// FMLALL (single)
870+
def SVMLA_FP8_SINGLE_ZA32_VG4x1 : Inst<"svmla[_single]_za32[_mf8]_vg4x1_fpm", "vmdd>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x1",
871+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
872+
def SVMLA_FP8_SINGLE_ZA32_VG4x2 : Inst<"svmla[_single]_za32[_mf8]_vg4x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x2",
873+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
874+
def SVMLA_FP8_SINGLE_ZA32_VG4x4 : Inst<"svmla[_single]_za32[_mf8]_vg4x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x4",
875+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
869876
}
870877

871878
let SMETargetGuard = "sme-f8f16" in {
872879
def SVMOPA_FP8_ZA16 : Inst<"svmopa_za16[_mf8]_m_fpm", "viPPdd>", "m", MergeNone, "aarch64_sme_fp8_fmopa_za16",
873880
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<0, ImmCheck0_1>]>;
874881
// FMLAL (indexed)
875-
def SVMLA_FP8_ZA16_VG2x1 : Inst<"svmla_lane_za16[_mf8]_vg2x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x1",
876-
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
877-
def SVMLA_FP8_ZA16_VG2x2 : Inst<"svmla_lane_za16[_mf8]_vg2x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x2",
878-
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
879-
def SVMLA_FP8_ZA16_VG2x4 : Inst<"svmla_lane_za16[_mf8]_vg2x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x4",
880-
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
882+
def SVMLA_FP8_LANE_ZA16_VG2x1 : Inst<"svmla_lane_za16[_mf8]_vg2x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x1",
883+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
884+
def SVMLA_FP8_LANE_ZA16_VG2x2 : Inst<"svmla_lane_za16[_mf8]_vg2x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x2",
885+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
886+
def SVMLA_FP8_LANE_ZA16_VG2x4 : Inst<"svmla_lane_za16[_mf8]_vg2x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x4",
887+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
888+
// FMLAL (single)
889+
def SVMLA_FP8_SINGLE_ZA16_VG2x1 : Inst<"svmla[_single]_za16[_mf8]_vg2x1_fpm", "vmdd>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x1",
890+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
891+
def SVMLA_FP8_SINGLE_ZA16_VG2x2 : Inst<"svmla[_single]_za16[_mf8]_vg2x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x2",
892+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
893+
def SVMLA_FP8_SINGLE_ZA16_VG2x4 : Inst<"svmla[_single]_za16[_mf8]_vg2x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x4",
894+
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
881895
}
882896

883897
} // let SVETargetGuard = InvalidMode

clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_mla.c

Lines changed: 121 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111
#include <arm_sme.h>
1212

1313
#ifdef SME_OVERLOADED_FORMS
14-
#define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
14+
#define SME_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED, A5) A1##A3##A5
1515
#else
16-
#define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3
16+
#define SME_ACLE_FUNC(A1, A2, A3, A4, A5) A1##A2##A3##A4##A5
1717
#endif
1818

1919
// FMLAL (indexed)
@@ -33,7 +33,7 @@
3333
// CPP-CHECK-NEXT: ret void
3434
//
3535
void test_svmla_lane_za16_vg2x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
36-
SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x1_fpm)(slice, zn, zm, 0, fpm);
36+
SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x1_fpm,,)(slice, zn, zm, 0, fpm);
3737
}
3838

3939
// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_vg2x2(
@@ -51,7 +51,7 @@ void test_svmla_lane_za16_vg2x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm,
5151
// CPP-CHECK-NEXT: ret void
5252
//
5353
void test_svmla_lane_za16_vg2x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
54-
SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x2_fpm)(slice, zn, zm, 15, fpm);
54+
SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x2_fpm,,)(slice, zn, zm, 15, fpm);
5555
}
5656

5757
// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_vg2x4(
@@ -69,7 +69,7 @@ void test_svmla_lane_za16_vg2x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm
6969
// CPP-CHECK-NEXT: ret void
7070
//
7171
void test_svmla_lane_za16_vg2x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
72-
SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x4_fpm)(slice, zn, zm, 7, fpm);
72+
SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x4_fpm,,)(slice, zn, zm, 7, fpm);
7373
}
7474

7575
// FMLALL (indexed)
@@ -89,7 +89,7 @@ void test_svmla_lane_za16_vg2x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm
8989
// CPP-CHECK-NEXT: ret void
9090
//
9191
void test_svmla_lane_za32_vg4x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
92-
SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x1_fpm)(slice, zn, zm, 0, fpm);
92+
SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x1_fpm,,)(slice, zn, zm, 0, fpm);
9393
}
9494

9595
// CHECK-LABEL: define dso_local void @test_svmla_lane_za32_vg4x2(
@@ -107,7 +107,7 @@ void test_svmla_lane_za32_vg4x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm,
107107
// CPP-CHECK-NEXT: ret void
108108
//
109109
void test_svmla_lane_za32_vg4x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
110-
SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x2_fpm)(slice, zn, zm, 15, fpm);
110+
SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x2_fpm,,)(slice, zn, zm, 15, fpm);
111111
}
112112

113113
// CHECK-LABEL: define dso_local void @test_svmla_lane_za32_vg4x4(
@@ -125,5 +125,117 @@ void test_svmla_lane_za32_vg4x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm
125125
// CPP-CHECK-NEXT: ret void
126126
//
127127
void test_svmla_lane_za32_vg4x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
128-
SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x4_fpm)(slice, zn, zm, 7, fpm);
129-
}
128+
SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x4_fpm,,)(slice, zn, zm, 7, fpm);
129+
}
130+
131+
// FMLAL (single)
132+
133+
// CHECK-LABEL: define dso_local void @test_svmla_single_za16_vg2x1(
134+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
135+
// CHECK-NEXT: [[ENTRY:.*:]]
136+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
137+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
138+
// CHECK-NEXT: ret void
139+
//
140+
// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za16_vg2x1ju13__SVMfloat8_tS_m(
141+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
142+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
143+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
144+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
145+
// CPP-CHECK-NEXT: ret void
146+
//
147+
void test_svmla_single_za16_vg2x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
148+
SME_ACLE_FUNC(svmla,_single,_za16,_mf8,_vg2x1_fpm)(slice, zn, zm, fpm);
149+
}
150+
151+
// CHECK-LABEL: define dso_local void @test_svmla_single_za16_vg2x2(
152+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
153+
// CHECK-NEXT: [[ENTRY:.*:]]
154+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
155+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
156+
// CHECK-NEXT: ret void
157+
//
158+
// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za16_vg2x2j13svmfloat8x2_tu13__SVMfloat8_tm(
159+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
160+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
161+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
162+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
163+
// CPP-CHECK-NEXT: ret void
164+
//
165+
void test_svmla_single_za16_vg2x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
166+
SME_ACLE_FUNC(svmla,_single,_za16,_mf8,_vg2x2_fpm)(slice, zn, zm, fpm);
167+
}
168+
169+
// CHECK-LABEL: define dso_local void @test_svmla_single_za16_vg2x4(
170+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
171+
// CHECK-NEXT: [[ENTRY:.*:]]
172+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
173+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
174+
// CHECK-NEXT: ret void
175+
//
176+
// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za16_vg2x4j13svmfloat8x4_tu13__SVMfloat8_tm(
177+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
178+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
179+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
180+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
181+
// CPP-CHECK-NEXT: ret void
182+
//
183+
void test_svmla_single_za16_vg2x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
184+
SME_ACLE_FUNC(svmla,_single,_za16,_mf8,_vg2x4_fpm)(slice, zn, zm, fpm);
185+
}
186+
187+
// FMLALL (single)
188+
189+
// CHECK-LABEL: define dso_local void @test_svmla_single_za32_vg4x1(
190+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
191+
// CHECK-NEXT: [[ENTRY:.*:]]
192+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
193+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
194+
// CHECK-NEXT: ret void
195+
//
196+
// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za32_vg4x1ju13__SVMfloat8_tS_m(
197+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
198+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
199+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
200+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
201+
// CPP-CHECK-NEXT: ret void
202+
//
203+
void test_svmla_single_za32_vg4x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
204+
SME_ACLE_FUNC(svmla,_single,_za32,_mf8,_vg4x1_fpm)(slice, zn, zm, fpm);
205+
}
206+
207+
// CHECK-LABEL: define dso_local void @test_svmla_single_za32_vg4x2(
208+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
209+
// CHECK-NEXT: [[ENTRY:.*:]]
210+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
211+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
212+
// CHECK-NEXT: ret void
213+
//
214+
// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za32_vg4x2j13svmfloat8x2_tu13__SVMfloat8_tm(
215+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
216+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
217+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
218+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
219+
// CPP-CHECK-NEXT: ret void
220+
//
221+
void test_svmla_single_za32_vg4x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
222+
SME_ACLE_FUNC(svmla,_single,_za32,_mf8,_vg4x2_fpm)(slice, zn, zm, fpm);
223+
}
224+
225+
// CHECK-LABEL: define dso_local void @test_svmla_single_za32_vg4x4(
226+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
227+
// CHECK-NEXT: [[ENTRY:.*:]]
228+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
229+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
230+
// CHECK-NEXT: ret void
231+
//
232+
// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za32_vg4x4j13svmfloat8x4_tu13__SVMfloat8_tm(
233+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
234+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
235+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
236+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
237+
// CPP-CHECK-NEXT: ret void
238+
//
239+
void test_svmla_single_za32_vg4x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
240+
SME_ACLE_FUNC(svmla,_single,_za32,_mf8,_vg4x4_fpm)(slice, zn, zm, fpm);
241+
}

clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_mla.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,22 @@ void test_svmla(uint32_t slice, svmfloat8_t zn, svmfloat8x2_t znx2, svmfloat8x4_
2323

2424
// expected-error@+1 {{'svmla_lane_za32_mf8_vg4x4_fpm' needs target feature sme,sme-f8f32}}
2525
svmla_lane_za32_mf8_vg4x4_fpm(slice, znx4, zn, 0, fpmr);
26+
27+
// expected-error@+1 {{'svmla_single_za16_mf8_vg2x1_fpm' needs target feature sme,sme-f8f16}}
28+
svmla_single_za16_mf8_vg2x1_fpm(slice, zn, zn, fpmr);
29+
30+
// expected-error@+1 {{'svmla_single_za16_mf8_vg2x2_fpm' needs target feature sme,sme-f8f16}}
31+
svmla_single_za16_mf8_vg2x2_fpm(slice, znx2, zn, fpmr);
32+
33+
// expected-error@+1 {{'svmla_single_za16_mf8_vg2x4_fpm' needs target feature sme,sme-f8f16}}
34+
svmla_single_za16_mf8_vg2x4_fpm(slice, znx4, zn, fpmr);
35+
36+
// expected-error@+1 {{'svmla_single_za32_mf8_vg4x1_fpm' needs target feature sme,sme-f8f32}}
37+
svmla_single_za32_mf8_vg4x1_fpm(slice, zn, zn, fpmr);
38+
39+
// expected-error@+1 {{'svmla_single_za32_mf8_vg4x2_fpm' needs target feature sme,sme-f8f32}}
40+
svmla_single_za32_mf8_vg4x2_fpm(slice, znx2, zn, fpmr);
41+
42+
// expected-error@+1 {{'svmla_single_za32_mf8_vg4x4_fpm' needs target feature sme,sme-f8f32}}
43+
svmla_single_za32_mf8_vg4x4_fpm(slice, znx4, zn, fpmr);
2644
}

0 commit comments

Comments
 (0)