Skip to content

Commit e3b6059

Browse files
committed
[X86][SSE] combineX86ShufflesConstants - early out for zeroable vectors (PR45443)
Shuffle combining can insert byte-sized zero elements into the shuffle mask, which combineX86ShufflesConstants will then attempt to fold without taking into account whether the byte-sized vector type is legal (e.g. on AVX512F-only targets). If the whole vector is zeroable (every element is undef or zero), we should just return a zero vector of the root type; otherwise, if the mask type isn't legal, we should bail. Fixes PR45443.
1 parent 6b3353e commit e3b6059

File tree

2 files changed

+28
-1
lines changed

2 files changed

+28
-1
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34749,6 +34749,7 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
3474934749
return SDValue();
3475034750

3475134751
// Shuffle the constant bits according to the mask.
34752+
SDLoc DL(Root);
3475234753
APInt UndefElts(NumMaskElts, 0);
3475334754
APInt ZeroElts(NumMaskElts, 0);
3475434755
APInt ConstantElts(NumMaskElts, 0);
@@ -34786,6 +34787,10 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
3478634787
}
3478734788
assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
3478834789

34790+
// Attempt to create a zero vector.
34791+
if ((UndefElts | ZeroElts).isAllOnesValue())
34792+
return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
34793+
3478934794
// Create the constant data.
3479034795
MVT MaskSVT;
3479134796
if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
@@ -34794,8 +34799,9 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
3479434799
MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
3479534800

3479634801
MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
34802+
if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
34803+
return SDValue();
3479734804

34798-
SDLoc DL(Root);
3479934805
SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
3480034806
return DAG.getBitcast(VT, CstOp);
3480134807
}

llvm/test/CodeGen/X86/pr45443.ll

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=i686-- -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86
3+
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64
4+
5+
define <16 x float> @PR45443() {
6+
; CHECK-LABEL: PR45443:
7+
; CHECK: # %bb.0: # %bb
8+
; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm0
9+
; CHECK-NEXT: ret{{[l|q]}}
10+
bb:
11+
%tmp = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> <i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040>, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>)
12+
%tmp4 = tail call fast <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> <float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000>, <16 x float> undef)
13+
%tmp5 = icmp ult <16 x i32> %tmp, <i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216>
14+
%tmp6 = and <16 x i32> %tmp, <i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215, i32 16777215>
15+
%tmp7 = icmp ne <16 x i32> %tmp6, zeroinitializer
16+
%tmp8 = and <16 x i1> %tmp7, %tmp5
17+
%tmp9 = select fast <16 x i1> %tmp8, <16 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>, <16 x float> %tmp4
18+
ret <16 x float> %tmp9
19+
}
20+
declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>)
21+
declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>)

0 commit comments

Comments
 (0)