Skip to content

Commit 547bfda

Browse files
authored
[AArch64] Improve bfcvtn2 and remove aarch64_neon_bfcvt intrinsics (llvm#120363)
This started out as an attempt to combine bf16 fpround into BFCVTN2 instructions, but ended up removing the aarch64.neon.bfcvt intrinsics in favour of generating fptrunc instructions directly. This simplifies the patterns and can lead to other optimizations. The BFCVTN2 instruction is adjusted to make sure the types are valid, and a bfcvtn2 is now generated in more places. The old intrinsics are auto-upgraded to fptrunc instructions too.
1 parent c22364a commit 547bfda

File tree

10 files changed

+190
-165
lines changed

10 files changed

+190
-165
lines changed

clang/include/clang/Basic/arm_neon.td

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -259,11 +259,6 @@ def OP_VCVT_F32_BF16_LO
259259
def OP_VCVT_F32_BF16_HI
260260
: Op<(call "vcvt_f32_bf16", (call "vget_high", $p0))>;
261261

262-
def OP_VCVT_BF16_F32_LO_A64
263-
: Op<(call "__a64_vcvtq_low_bf16", $p0)>;
264-
def OP_VCVT_BF16_F32_A64
265-
: Op<(call "vget_low", (call "__a64_vcvtq_low_bf16", $p0))>;
266-
267262
def OP_VCVT_BF16_F32_A32
268263
: Op<(call "__a32_vcvt_bf16", $p0)>;
269264

@@ -2061,10 +2056,9 @@ let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard =
20612056
}
20622057

20632058
let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16,neon" in {
2064-
def VCVT_LOW_BF16_F32_A64_INTERNAL : WInst<"__a64_vcvtq_low_bf16", "BQ", "Hf">;
2065-
def VCVT_LOW_BF16_F32_A64 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A64>;
2059+
def VCVT_LOW_BF16_F32_A64 : SInst<"vcvt_low_bf16", "BQ", "Qf">;
20662060
def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">;
2067-
def VCVT_BF16_F32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A64>;
2061+
def VCVT_BF16_F32 : SInst<"vcvt_bf16", "BQ", "f">;
20682062

20692063
def COPY_LANE_BF16 : IOpInst<"vcopy_lane", "..I.I", "b", OP_COPY_LN>;
20702064
def COPYQ_LANE_BF16 : IOpInst<"vcopy_lane", "..IqI", "Qb", OP_COPY_LN>;

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7307,7 +7307,6 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
73077307
};
73087308

73097309
static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
7310-
NEONMAP1(__a64_vcvtq_low_bf16_f32, aarch64_neon_bfcvtn, 0),
73117310
NEONMAP0(splat_lane_v),
73127311
NEONMAP0(splat_laneq_v),
73137312
NEONMAP0(splatq_lane_v),
@@ -7407,7 +7406,8 @@ static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
74077406
NEONMAP0(vcvtq_f16_s16),
74087407
NEONMAP0(vcvtq_f16_u16),
74097408
NEONMAP0(vcvtq_f32_v),
7410-
NEONMAP1(vcvtq_high_bf16_f32, aarch64_neon_bfcvtn2, 0),
7409+
NEONMAP0(vcvtq_high_bf16_f32),
7410+
NEONMAP0(vcvtq_low_bf16_f32),
74117411
NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
74127412
NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
74137413
NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
@@ -7616,7 +7616,7 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
76167616
NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
76177617
NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
76187618
NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
7619-
NEONMAP1(vcvth_bf16_f32, aarch64_neon_bfcvt, 0),
7619+
NEONMAP0(vcvth_bf16_f32),
76207620
NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
76217621
NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
76227622
NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
@@ -12083,6 +12083,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
1208312083
return ConstantInt::get(Builder.getInt32Ty(), 0);
1208412084
}
1208512085

12086+
if (BuiltinID == NEON::BI__builtin_neon_vcvth_bf16_f32)
12087+
return Builder.CreateFPTrunc(
12088+
Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)),
12089+
Builder.getFloatTy()),
12090+
Builder.getBFloatTy());
12091+
1208612092
// Handle MSVC intrinsics before argument evaluation to prevent double
1208712093
// evaluation.
1208812094
if (std::optional<MSVCIntrin> MsvcIntId =
@@ -12808,6 +12814,35 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
1280812814
return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
1280912815
"vgetq_lane");
1281012816
}
12817+
case NEON::BI__builtin_neon_vcvt_bf16_f32: {
12818+
llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
12819+
llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
12820+
return Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
12821+
}
12822+
case NEON::BI__builtin_neon_vcvtq_low_bf16_f32: {
12823+
SmallVector<int, 16> ConcatMask(8);
12824+
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
12825+
llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
12826+
llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
12827+
llvm::Value *Trunc =
12828+
Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
12829+
return Builder.CreateShuffleVector(
12830+
Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
12831+
}
12832+
case NEON::BI__builtin_neon_vcvtq_high_bf16_f32: {
12833+
SmallVector<int, 16> ConcatMask(8);
12834+
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
12835+
SmallVector<int, 16> LoMask(4);
12836+
std::iota(LoMask.begin(), LoMask.end(), 0);
12837+
llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
12838+
llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
12839+
llvm::Type *V8BF16 = FixedVectorType::get(Builder.getBFloatTy(), 8);
12840+
llvm::Value *Inactive = Builder.CreateShuffleVector(
12841+
Builder.CreateBitCast(Ops[0], V8BF16), LoMask);
12842+
llvm::Value *Trunc =
12843+
Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[1], V4F32), V4BF16);
12844+
return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
12845+
}
1281112846

1281212847
case clang::AArch64::BI_InterlockedAdd:
1281312848
case clang::AArch64::BI_InterlockedAdd64: {

clang/test/CodeGen/arm-bf16-convert-intrinsics.c

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -223,10 +223,8 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
223223
// CHECK-A64-LABEL: @test_vcvt_bf16_f32(
224224
// CHECK-A64-NEXT: entry:
225225
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
226-
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
227-
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
228-
// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
229-
// CHECK-A64-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
226+
// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
227+
// CHECK-A64-NEXT: ret <4 x bfloat> [[TMP1]]
230228
//
231229
// CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32(
232230
// CHECK-A32-HARDFP-NEXT: entry:
@@ -263,9 +261,9 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
263261
// CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32(
264262
// CHECK-A64-NEXT: entry:
265263
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
266-
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
267-
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
268-
// CHECK-A64-NEXT: ret <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]]
264+
// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
265+
// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
266+
// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP2]]
269267
//
270268
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
271269
// CHECK-A32-HARDFP-NEXT: entry:
@@ -323,9 +321,10 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
323321
// CHECK-A64-NEXT: entry:
324322
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8>
325323
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
326-
// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F322_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]])
327-
// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]] to <16 x i8>
328-
// CHECK-A64-NEXT: ret <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]]
324+
// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <8 x bfloat> [[INACTIVE]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
325+
// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
326+
// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
327+
// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]]
329328
//
330329
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32(
331330
// CHECK-A32-HARDFP-NEXT: entry:
@@ -404,8 +403,8 @@ bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {
404403

405404
// CHECK-A64-LABEL: @test_vcvth_bf16_f32(
406405
// CHECK-A64-NEXT: entry:
407-
// CHECK-A64-NEXT: [[VCVTH_BF16_F32_I:%.*]] = call bfloat @llvm.aarch64.neon.bfcvt(float [[A:%.*]])
408-
// CHECK-A64-NEXT: ret bfloat [[VCVTH_BF16_F32_I]]
406+
// CHECK-A64-NEXT: [[TMP0:%.*]] = fptrunc float [[A:%.*]] to bfloat
407+
// CHECK-A64-NEXT: ret bfloat [[TMP0]]
409408
//
410409
// CHECK-A32-HARDFP-LABEL: @test_vcvth_bf16_f32(
411410
// CHECK-A32-HARDFP-NEXT: entry:

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -538,17 +538,6 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
538538
def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic;
539539
def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic;
540540

541-
542-
// v8.6-A Bfloat Intrinsics
543-
def int_aarch64_neon_bfcvt
544-
: DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>;
545-
def int_aarch64_neon_bfcvtn
546-
: DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v4f32_ty], [IntrNoMem]>;
547-
def int_aarch64_neon_bfcvtn2
548-
: DefaultAttrsIntrinsic<[llvm_v8bf16_ty],
549-
[llvm_v8bf16_ty, llvm_v4f32_ty],
550-
[IntrNoMem]>;
551-
552541
// v8.2-A FP16 Fused Multiply-Add Long
553542
def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic;
554543
def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 61 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
#include "llvm/Support/Regex.h"
4646
#include "llvm/TargetParser/Triple.h"
4747
#include <cstring>
48+
#include <numeric>
4849

4950
using namespace llvm;
5051

@@ -828,6 +829,13 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
828829
return true;
829830
}
830831
}
832+
833+
// Changed in 20.0: bfcvt/bfcvtn/bfcvtn2 have been replaced with fptrunc.
834+
if (Name.starts_with("bfcvt")) {
835+
NewFn = nullptr;
836+
return true;
837+
}
838+
831839
return false; // No other 'aarch64.neon.*'.
832840
}
833841
if (Name.consume_front("sve.")) {
@@ -4028,31 +4036,59 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
40284036

40294037
static Value *upgradeAArch64IntrinsicCall(StringRef Name, CallBase *CI,
40304038
Function *F, IRBuilder<> &Builder) {
4031-
Intrinsic::ID NewID =
4032-
StringSwitch<Intrinsic::ID>(Name)
4033-
.Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2)
4034-
.Case("sve.fcvtnt.bf16f32", Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2)
4035-
.Default(Intrinsic::not_intrinsic);
4036-
if (NewID == Intrinsic::not_intrinsic)
4037-
llvm_unreachable("Unhandled Intrinsic!");
4038-
4039-
SmallVector<Value *, 3> Args(CI->args());
4040-
4041-
// The original intrinsics incorrectly used a predicate based on the smallest
4042-
// element type rather than the largest.
4043-
Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8);
4044-
Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4);
4045-
4046-
if (Args[1]->getType() != BadPredTy)
4047-
llvm_unreachable("Unexpected predicate type!");
4048-
4049-
Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
4050-
BadPredTy, Args[1]);
4051-
Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
4052-
GoodPredTy, Args[1]);
4053-
4054-
return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr,
4055-
CI->getName());
4039+
if (Name.starts_with("neon.bfcvt")) {
4040+
if (Name.starts_with("neon.bfcvtn2")) {
4041+
SmallVector<int, 32> LoMask(4);
4042+
std::iota(LoMask.begin(), LoMask.end(), 0);
4043+
SmallVector<int, 32> ConcatMask(8);
4044+
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
4045+
Value *Inactive = Builder.CreateShuffleVector(CI->getOperand(0), LoMask);
4046+
Value *Trunc =
4047+
Builder.CreateFPTrunc(CI->getOperand(1), Inactive->getType());
4048+
return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
4049+
} else if (Name.starts_with("neon.bfcvtn")) {
4050+
SmallVector<int, 32> ConcatMask(8);
4051+
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
4052+
Type *V4BF16 =
4053+
FixedVectorType::get(Type::getBFloatTy(F->getContext()), 4);
4054+
Value *Trunc = Builder.CreateFPTrunc(CI->getOperand(0), V4BF16);
4055+
dbgs() << "Trunc: " << *Trunc << "\n";
4056+
return Builder.CreateShuffleVector(
4057+
Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
4058+
} else {
4059+
return Builder.CreateFPTrunc(CI->getOperand(0),
4060+
Type::getBFloatTy(F->getContext()));
4061+
}
4062+
} else if (Name.starts_with("sve.fcvt")) {
4063+
Intrinsic::ID NewID =
4064+
StringSwitch<Intrinsic::ID>(Name)
4065+
.Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2)
4066+
.Case("sve.fcvtnt.bf16f32",
4067+
Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2)
4068+
.Default(Intrinsic::not_intrinsic);
4069+
if (NewID == Intrinsic::not_intrinsic)
4070+
llvm_unreachable("Unhandled Intrinsic!");
4071+
4072+
SmallVector<Value *, 3> Args(CI->args());
4073+
4074+
// The original intrinsics incorrectly used a predicate based on the
4075+
// smallest element type rather than the largest.
4076+
Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8);
4077+
Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4);
4078+
4079+
if (Args[1]->getType() != BadPredTy)
4080+
llvm_unreachable("Unexpected predicate type!");
4081+
4082+
Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
4083+
BadPredTy, Args[1]);
4084+
Args[1] = Builder.CreateIntrinsic(
4085+
Intrinsic::aarch64_sve_convert_from_svbool, GoodPredTy, Args[1]);
4086+
4087+
return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr,
4088+
CI->getName());
4089+
}
4090+
4091+
llvm_unreachable("Unhandled Intrinsic!");
40564092
}
40574093

40584094
static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F,

llvm/lib/Target/AArch64/AArch64InstrFormats.td

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9053,22 +9053,19 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm>
90539053

90549054
let mayRaiseFPException = 1, Uses = [FPCR] in
90559055
class SIMD_BFCVTN
9056-
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
9056+
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V64,
90579057
"bfcvtn", ".4h", ".4s",
9058-
[(set (v8bf16 V128:$Rd),
9059-
(int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>;
9058+
[(set (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn)))]>;
90609059

90619060
let mayRaiseFPException = 1, Uses = [FPCR] in
90629061
class SIMD_BFCVTN2
90639062
: BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128,
9064-
"bfcvtn2", ".8h", ".4s",
9065-
[(set (v8bf16 V128:$dst),
9066-
(int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>;
9063+
"bfcvtn2", ".8h", ".4s", []>;
90679064

90689065
let mayRaiseFPException = 1, Uses = [FPCR] in
90699066
class BF16ToSinglePrecision<string asm>
90709067
: I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "",
9071-
[(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>,
9068+
[(set (bf16 FPR16:$Rd), (any_fpround (f32 FPR32:$Rn)))]>,
90729069
Sched<[WriteFCvt]> {
90739070
bits<5> Rd;
90749071
bits<5> Rn;

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1454,8 +1454,8 @@ def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
14541454
def BFCVTN : SIMD_BFCVTN;
14551455
def BFCVTN2 : SIMD_BFCVTN2;
14561456

1457-
def : Pat<(v4bf16 (any_fpround (v4f32 V128:$Rn))),
1458-
(EXTRACT_SUBREG (BFCVTN V128:$Rn), dsub)>;
1457+
def : Pat<(concat_vectors (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn))),
1458+
(BFCVTN2 (v8bf16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub)), V128:$Rn)>;
14591459

14601460
// Vector-scalar BFDOT:
14611461
// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit
@@ -1477,8 +1477,6 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot
14771477

14781478
let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in {
14791479
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
1480-
// Round FP32 to BF16.
1481-
def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>;
14821480
}
14831481

14841482
// ARMv8.6A AArch64 matrix multiplication
@@ -10412,9 +10410,11 @@ multiclass PromoteUnaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst>
1041210410
let Predicates = [HasBF16] in
1041310411
def : Pat<(InOp (v8bf16 V128:$Rn)),
1041410412
(v8bf16 (BFCVTN2
10415-
(v8bf16 (BFCVTN
10416-
(v4f32 (OutInst
10417-
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
10413+
(INSERT_SUBREG (IMPLICIT_DEF),
10414+
(v4bf16 (BFCVTN
10415+
(v4f32 (OutInst
10416+
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
10417+
dsub),
1041810418
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn))))))>;
1041910419

1042010420
let Predicates = [HasNoBF16] in
@@ -10449,10 +10449,12 @@ multiclass PromoteBinaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst
1044910449
let Predicates = [HasBF16] in
1045010450
def : Pat<(InOp (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)),
1045110451
(v8bf16 (BFCVTN2
10452-
(v8bf16 (BFCVTN
10453-
(v4f32 (OutInst
10454-
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
10455-
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
10452+
(INSERT_SUBREG (IMPLICIT_DEF),
10453+
(v4bf16 (BFCVTN
10454+
(v4f32 (OutInst
10455+
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
10456+
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
10457+
dsub),
1045610458
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn)),
1045710459
(v4f32 (SHLLv8i16 V128:$Rm))))))>;
1045810460

llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64 -mattr=+neon -mattr=+bf16 | FileCheck %s
22

3+
; This test exercises the old neon.bfcvt intrinsics, which are now
4+
; autoupgraded to fptrunc operations.
5+
36
declare bfloat @llvm.aarch64.neon.bfcvt(float)
47
declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float>)
58
declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat>, <4 x float>)

0 commit comments

Comments
 (0)