Skip to content

Commit b0763a4

Browse files
authored
[AArch64] Implement intrinsics for FP8 FCVT/FCVTN/BFCVT (#118025)
This patch implements the following intrinsics: Convert to packed 8-bit floating-point format. ``` c // Variants are also available for: _mf8[_bf16_x2] and _mf8[_f32_x4] svmfloat8_t svcvt_mf8[_f16_x2]_fpm(svfloat16x2_t zn, fpm_t fpm) __arm_streaming; ``` Convert to interleaved 8-bit floating-point format. ``` c svmfloat8_t svcvtn_mf8[_f32_x4]_fpm(svfloat32x4_t zn, fpm_t fpm) __arm_streaming; ``` In accordance with ARM-software/acle#323. Co-authored-by: Marin Lukac [email protected] Co-authored-by: Caroline Concatto [email protected]
1 parent 9cdb7d2 commit b0763a4

File tree

7 files changed

+166
-8
lines changed

7 files changed

+166
-8
lines changed

clang/include/clang/Basic/arm_sve.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2436,6 +2436,12 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,fp8" in {
24362436
// Convert from FP8 to deinterleaved half-precision/BFloat16 multi-vector
24372437
def SVF1CVTL_X2 : Inst<"svcvtl1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl1_x2", [IsStreaming, SetsFPMR], []>;
24382438
def SVF2CVTL_X2 : Inst<"svcvtl2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl2_x2", [IsStreaming, SetsFPMR], []>;
2439+
2440+
// Convert from single/half/bfloat multivector to FP8
2441+
def SVFCVT_X2 : Inst<"svcvt_mf8[_{d}_x2]_fpm", "~2>", "bh", MergeNone, "aarch64_sve_fp8_cvt_x2", [IsStreaming, SetsFPMR], []>;
2442+
def SVFCVT_X4 : Inst<"svcvt_mf8[_{d}_x4]_fpm", "~4>", "f", MergeNone, "aarch64_sve_fp8_cvt_x4", [IsOverloadNone, IsStreaming, SetsFPMR], []>;
2443+
// interleaved
2444+
def SVFCVTN_X4 : Inst<"svcvtn_mf8[_{d}_x4]_fpm", "~4>", "f", MergeNone, "aarch64_sve_fp8_cvtn_x4", [IsOverloadNone, IsStreaming, SetsFPMR], []>;
24392445
}
24402446

24412447
let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in {

clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,70 @@
1616
#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
1717
#endif
1818

19+
// CHECK-LABEL: @test_cvt_f16_x2(
20+
// CHECK-NEXT: entry:
21+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
22+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvt.x2.nxv8f16(<vscale x 8 x half> [[ZN_COERCE0:%.*]], <vscale x 8 x half> [[ZN_COERCE1:%.*]])
23+
// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
24+
//
25+
// CPP-CHECK-LABEL: @_Z15test_cvt_f16_x213svfloat16x2_tm(
26+
// CPP-CHECK-NEXT: entry:
27+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
28+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvt.x2.nxv8f16(<vscale x 8 x half> [[ZN_COERCE0:%.*]], <vscale x 8 x half> [[ZN_COERCE1:%.*]])
29+
// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
30+
//
31+
svmfloat8_t test_cvt_f16_x2(svfloat16x2_t zn, fpm_t fpmr) __arm_streaming {
32+
return SVE_ACLE_FUNC(svcvt_mf8,_f16_x2,_fpm)(zn, fpmr);
33+
}
34+
35+
// CHECK-LABEL: @test_cvt_f32_x4(
36+
// CHECK-NEXT: entry:
37+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
38+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvt.x4(<vscale x 4 x float> [[ZN_COERCE0:%.*]], <vscale x 4 x float> [[ZN_COERCE1:%.*]], <vscale x 4 x float> [[ZN_COERCE2:%.*]], <vscale x 4 x float> [[ZN_COERCE3:%.*]])
39+
// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
40+
//
41+
// CPP-CHECK-LABEL: @_Z15test_cvt_f32_x413svfloat32x4_tm(
42+
// CPP-CHECK-NEXT: entry:
43+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
44+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvt.x4(<vscale x 4 x float> [[ZN_COERCE0:%.*]], <vscale x 4 x float> [[ZN_COERCE1:%.*]], <vscale x 4 x float> [[ZN_COERCE2:%.*]], <vscale x 4 x float> [[ZN_COERCE3:%.*]])
45+
// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
46+
//
47+
svmfloat8_t test_cvt_f32_x4(svfloat32x4_t zn, fpm_t fpmr) __arm_streaming {
48+
return SVE_ACLE_FUNC(svcvt_mf8,_f32_x4,_fpm)(zn, fpmr);
49+
}
50+
51+
// CHECK-LABEL: @test_cvtn_f32_x4(
52+
// CHECK-NEXT: entry:
53+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
54+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.x4(<vscale x 4 x float> [[ZN_COERCE0:%.*]], <vscale x 4 x float> [[ZN_COERCE1:%.*]], <vscale x 4 x float> [[ZN_COERCE2:%.*]], <vscale x 4 x float> [[ZN_COERCE3:%.*]])
55+
// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
56+
//
57+
// CPP-CHECK-LABEL: @_Z16test_cvtn_f32_x413svfloat32x4_tm(
58+
// CPP-CHECK-NEXT: entry:
59+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
60+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.x4(<vscale x 4 x float> [[ZN_COERCE0:%.*]], <vscale x 4 x float> [[ZN_COERCE1:%.*]], <vscale x 4 x float> [[ZN_COERCE2:%.*]], <vscale x 4 x float> [[ZN_COERCE3:%.*]])
61+
// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
62+
//
63+
svmfloat8_t test_cvtn_f32_x4(svfloat32x4_t zn, fpm_t fpmr) __arm_streaming {
64+
return SVE_ACLE_FUNC(svcvtn_mf8,_f32_x4,_fpm)(zn, fpmr);
65+
}
66+
67+
// CHECK-LABEL: @test_cvt_bf16_x2(
68+
// CHECK-NEXT: entry:
69+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
70+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvt.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZN_COERCE1:%.*]])
71+
// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
72+
//
73+
// CPP-CHECK-LABEL: @_Z16test_cvt_bf16_x214svbfloat16x2_tm(
74+
// CPP-CHECK-NEXT: entry:
75+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
76+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvt.x2.nxv8bf16(<vscale x 8 x bfloat> [[ZN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZN_COERCE1:%.*]])
77+
// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
78+
//
79+
svmfloat8_t test_cvt_bf16_x2(svbfloat16x2_t zn, fpm_t fpmr) __arm_streaming {
80+
return SVE_ACLE_FUNC(svcvt_mf8,_bf16_x2,_fpm)(zn, fpmr);
81+
}
82+
1983
// CHECK-LABEL: @test_cvt1_f16_x2(
2084
// CHECK-NEXT: entry:
2185
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])

clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_cvt.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
#include <arm_sve.h>
66

77

8-
void test_features_sme2_fp8(svmfloat8_t zn, fpm_t fpmr) __arm_streaming {
8+
void test_features_sme2_fp8(svmfloat8_t zn, svfloat16x2_t znf16, svbfloat16x2_t znbf16,
9+
svfloat32x4_t znf32, fpm_t fpmr) __arm_streaming {
910
// expected-error@+1 {{'svcvtl1_f16_mf8_x2_fpm' needs target feature sme,sme2,fp8}}
1011
svcvtl1_f16_mf8_x2_fpm(zn, fpmr);
1112
// expected-error@+1 {{'svcvtl2_f16_mf8_x2_fpm' needs target feature sme,sme2,fp8}}
@@ -23,4 +24,13 @@ void test_features_sme2_fp8(svmfloat8_t zn, fpm_t fpmr) __arm_streaming {
2324
svcvt1_bf16_mf8_x2_fpm(zn, fpmr);
2425
// expected-error@+1 {{'svcvt2_bf16_mf8_x2_fpm' needs target feature sme,sme2,fp8}}
2526
svcvt2_bf16_mf8_x2_fpm(zn, fpmr);
27+
28+
// expected-error@+1 {{'svcvt_mf8_f16_x2_fpm' needs target feature sme,sme2,fp8}}
29+
svcvt_mf8_f16_x2_fpm(znf16, fpmr);
30+
// expected-error@+1 {{'svcvt_mf8_bf16_x2_fpm' needs target feature sme,sme2,fp8}}
31+
svcvt_mf8_bf16_x2_fpm(znbf16, fpmr);
32+
// expected-error@+1 {{'svcvt_mf8_f32_x4_fpm' needs target feature sme,sme2,fp8}}
33+
svcvt_mf8_f32_x4_fpm(znf32, fpmr);
34+
// expected-error@+1 {{'svcvtn_mf8_f32_x4_fpm' needs target feature sme,sme2,fp8}}
35+
svcvtn_mf8_f32_x4_fpm(znf32, fpmr);
2636
}

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3812,6 +3812,7 @@ let TargetPrefix = "aarch64" in {
38123812
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
38133813
LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>],
38143814
[IntrNoMem]>;
3815+
38153816
}
38163817

38173818
// SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2
@@ -3876,6 +3877,11 @@ let TargetPrefix = "aarch64" in {
38763877
[llvm_nxv16i8_ty],
38773878
[IntrReadMem, IntrInaccessibleMemOnly]>;
38783879

3880+
class SME2_FP8_CVT_Single_X4_Intrinsic
3881+
: DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
3882+
[llvm_nxv4f32_ty, llvm_nxv4f32_ty, llvm_nxv4f32_ty, llvm_nxv4f32_ty],
3883+
[IntrReadMem, IntrInaccessibleMemOnly]>;
3884+
38793885
class SME_FP8_OuterProduct_Intrinsic
38803886
: DefaultAttrsIntrinsic<[],
38813887
[llvm_i32_ty,
@@ -3894,6 +3900,17 @@ let TargetPrefix = "aarch64" in {
38943900
def int_aarch64_sve_fp8_cvtl1_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
38953901
def int_aarch64_sve_fp8_cvtl2_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
38963902

3903+
//
3904+
// CVT to FP8 from half-precision/BFloat16/single-precision multi-vector
3905+
//
3906+
def int_aarch64_sve_fp8_cvt_x2
3907+
: DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
3908+
[llvm_anyvector_ty, LLVMMatchType<0>],
3909+
[IntrReadMem, IntrInaccessibleMemOnly]>;
3910+
3911+
def int_aarch64_sve_fp8_cvt_x4 : SME2_FP8_CVT_Single_X4_Intrinsic;
3912+
def int_aarch64_sve_fp8_cvtn_x4 : SME2_FP8_CVT_Single_X4_Intrinsic;
3913+
38973914
// FP8 outer product
38983915
def int_aarch64_sme_fp8_fmopa_za16 : SME_FP8_OuterProduct_Intrinsic;
38993916
def int_aarch64_sme_fp8_fmopa_za32 : SME_FP8_OuterProduct_Intrinsic;

llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -954,10 +954,10 @@ defm F2CVTL_2ZZ_BtoH : sme2p1_fp8_cvt_vector_vg2_single<"f2cvtl", 0b10, 0b1>;
954954
defm BF2CVT_2ZZ_BtoH : sme2p1_fp8_cvt_vector_vg2_single<"bf2cvt", 0b11, 0b0>;
955955
defm BF2CVTL_2ZZ_BtoH : sme2p1_fp8_cvt_vector_vg2_single<"bf2cvtl", 0b11, 0b1>;
956956

957-
defm FCVT_Z2Z_HtoB : sme2_fp8_cvt_vg2_single<"fcvt", 0b0>;
958-
defm BFCVT_Z2Z_HtoB : sme2_fp8_cvt_vg2_single<"bfcvt", 0b1>;
959-
defm FCVT_Z4Z_StoB : sme2_fp8_cvt_vg4_single<"fcvt", 0b0>;
960-
defm FCVTN_Z4Z_StoB : sme2_fp8_cvt_vg4_single<"fcvtn", 0b1>;
957+
defm FCVT_Z2Z_HtoB : sme2_fp8_cvt_vg2_single<"fcvt", 0b0, nxv8f16, int_aarch64_sve_fp8_cvt_x2>;
958+
defm BFCVT_Z2Z_HtoB : sme2_fp8_cvt_vg2_single<"bfcvt", 0b1, nxv8bf16, int_aarch64_sve_fp8_cvt_x2>;
959+
defm FCVT_Z4Z_StoB : sme2_fp8_cvt_vg4_single<"fcvt", 0b0, int_aarch64_sve_fp8_cvt_x4>;
960+
defm FCVTN_Z4Z_StoB : sme2_fp8_cvt_vg4_single<"fcvtn", 0b1, int_aarch64_sve_fp8_cvtn_x4>;
961961

962962
defm FSCALE_2ZZ : sme2_fp_sve_destructive_vector_vg2_single<"fscale", 0b0011000>;
963963
defm FSCALE_4ZZ : sme2_fp_sve_destructive_vector_vg4_single<"fscale", 0b0011000>;

llvm/lib/Target/AArch64/SMEInstrFormats.td

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2398,10 +2398,14 @@ multiclass sme2_cvt_vg2_single<string mnemonic, bits<5> op, ValueType out_vt,
23982398
}
23992399

24002400
// SME2 multi-vec FP8 down convert two registers
2401-
multiclass sme2_fp8_cvt_vg2_single<string mnemonic, bit op> {
2401+
multiclass sme2_fp8_cvt_vg2_single<string mnemonic, bit op, ValueType in_vt, SDPatternOperator intrinsic> {
24022402
def NAME : sme2_cvt_vg2_single<mnemonic, {op, 0b1000}, ZPR8, ZZ_h_mul_r>{
2403+
let mayLoad = 1;
2404+
let mayStore = 0;
24032405
let Uses = [FPMR, FPCR];
24042406
}
2407+
def : Pat<(nxv16i8 (intrinsic in_vt:$Zn1, in_vt:$Zn2)),
2408+
(!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2Mul2, in_vt:$Zn1, zsub0, in_vt:$Zn2, zsub1))>;
24052409
}
24062410

24072411
class sme2_cvt_unpk_vector_vg2<bits<2>sz, bits<3> op, bit u, RegisterOperand first_ty,
@@ -2467,8 +2471,13 @@ multiclass sme2_int_cvt_vg4_single<string mnemonic, bits<3> op, SDPatternOperato
24672471
}
24682472

24692473
//SME2 multi-vec FP8 down convert four registers
2470-
multiclass sme2_fp8_cvt_vg4_single<string mnemonic, bit N> {
2471-
def _NAME : sme2_cvt_vg4_single<0b0, {0b00, N}, 0b0100, ZPR8, ZZZZ_s_mul_r, mnemonic>;
2474+
multiclass sme2_fp8_cvt_vg4_single<string mnemonic, bit N, SDPatternOperator intrinsic> {
2475+
def NAME : sme2_cvt_vg4_single<0b0, {0b00, N}, 0b0100, ZPR8, ZZZZ_s_mul_r, mnemonic> {
2476+
let mayLoad = 1;
2477+
let mayStore = 0;
2478+
let Uses = [FPMR, FPCR];
2479+
}
2480+
def : SME2_Cvt_VG4_Pat<NAME, intrinsic, nxv16i8, nxv4f32>;
24722481
}
24732482

24742483
class sme2_unpk_vector_vg4<bits<2>sz, bit u, RegisterOperand first_ty,

llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,58 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
22
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+fp8 -verify-machineinstrs -force-streaming < %s | FileCheck %s
33

4+
; FCVT / FCVTN / BFCVT
5+
6+
define <vscale x 16 x i8> @fcvt_x2(<vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1) {
7+
; CHECK-LABEL: fcvt_x2:
8+
; CHECK: // %bb.0:
9+
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
10+
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
11+
; CHECK-NEXT: fcvt z0.b, { z0.h, z1.h }
12+
; CHECK-NEXT: ret
13+
%res = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvt.x2.nxv8f16(<vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1)
14+
ret <vscale x 16 x i8> %res
15+
}
16+
17+
define <vscale x 16 x i8> @fcvt_x4(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
18+
; CHECK-LABEL: fcvt_x4:
19+
; CHECK: // %bb.0:
20+
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
21+
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
22+
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
23+
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
24+
; CHECK-NEXT: fcvt z0.b, { z0.s - z3.s }
25+
; CHECK-NEXT: ret
26+
%res = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvt.x4(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
27+
<vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
28+
ret <vscale x 16 x i8> %res
29+
}
30+
31+
define <vscale x 16 x i8> @fcvtn(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
32+
; CHECK-LABEL: fcvtn:
33+
; CHECK: // %bb.0:
34+
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
35+
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
36+
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
37+
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
38+
; CHECK-NEXT: fcvtn z0.b, { z0.s - z3.s }
39+
; CHECK-NEXT: ret
40+
%res = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.x4(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
41+
<vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
42+
ret <vscale x 16 x i8> %res
43+
}
44+
45+
define <vscale x 16 x i8> @bfcvt(<vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1) {
46+
; CHECK-LABEL: bfcvt:
47+
; CHECK: // %bb.0:
48+
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
49+
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
50+
; CHECK-NEXT: bfcvt z0.b, { z0.h, z1.h }
51+
; CHECK-NEXT: ret
52+
%res = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvt.x2.nxv8bf16(<vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1)
53+
ret <vscale x 16 x i8> %res
54+
}
55+
456
; F1CVT / F2CVT
557

658
define { <vscale x 8 x half>, <vscale x 8 x half> } @f1cvt(<vscale x 16 x i8> %zm) {

0 commit comments

Comments
 (0)