Skip to content

Commit fd949f3

Browse files
topperc authored and cuviper committed
[X86] Change precision control to FP80 during u64->fp32 conversion on Windows.
This is an alternative to D141074 to fix the problem by adjusting the precision control dynamically. Reviewed By: icedrocket Differential Revision: https://reviews.llvm.org/D142178 (cherry picked from commit 11fb09e)
1 parent 477e728 commit fd949f3

File tree

4 files changed

+145
-5
lines changed

4 files changed

+145
-5
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 78 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21616,15 +21616,25 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
2161621616
// Extend everything to 80 bits to force it to be done on x87.
2161721617
// TODO: Are there any fast-math-flags to propagate here?
2161821618
if (IsStrict) {
21619-
SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
21620-
{Chain, Fild, Fudge});
21619+
unsigned Opc = ISD::STRICT_FADD;
21620+
// Windows needs the precision control changed to 80bits around this add.
21621+
if (Subtarget.isOSWindows() && DstVT == MVT::f32)
21622+
Opc = X86ISD::STRICT_FP80_ADD;
21623+
21624+
SDValue Add =
21625+
DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
2162121626
// STRICT_FP_ROUND can't handle equal types.
2162221627
if (DstVT == MVT::f80)
2162321628
return Add;
2162421629
return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
2162521630
{Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
2162621631
}
21627-
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
21632+
unsigned Opc = ISD::FADD;
21633+
// Windows needs the precision control changed to 80bits around this add.
21634+
if (Subtarget.isOSWindows() && DstVT == MVT::f32)
21635+
Opc = X86ISD::FP80_ADD;
21636+
21637+
SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
2162821638
return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
2162921639
DAG.getIntPtrConstant(0, dl));
2163021640
}
@@ -33830,6 +33840,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
3383033840
NODE_NAME_CASE(AESENCWIDE256KL)
3383133841
NODE_NAME_CASE(AESDECWIDE256KL)
3383233842
NODE_NAME_CASE(TESTUI)
33843+
NODE_NAME_CASE(FP80_ADD)
33844+
NODE_NAME_CASE(STRICT_FP80_ADD)
3383333845
}
3383433846
return nullptr;
3383533847
#undef NODE_NAME_CASE
@@ -36340,6 +36352,69 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
3634036352
return BB;
3634136353
}
3634236354

36355+
case X86::FP80_ADDr:
36356+
case X86::FP80_ADDm32: {
36357+
// Change the floating point control register to use double extended
36358+
// precision when performing the addition.
36359+
int OrigCWFrameIdx =
36360+
MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36361+
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)),
36362+
OrigCWFrameIdx);
36363+
36364+
// Load the old value of the control word...
36365+
Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36366+
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
36367+
OrigCWFrameIdx);
36368+
36369+
// OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended
36370+
// precision.
36371+
Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36372+
BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
36373+
.addReg(OldCW, RegState::Kill)
36374+
.addImm(0x300);
36375+
36376+
// Extract to 16 bits.
36377+
Register NewCW16 =
36378+
MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
36379+
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
36380+
.addReg(NewCW, RegState::Kill, X86::sub_16bit);
36381+
36382+
// Prepare memory for FLDCW.
36383+
int NewCWFrameIdx =
36384+
MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36385+
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
36386+
NewCWFrameIdx)
36387+
.addReg(NewCW16, RegState::Kill);
36388+
36389+
// Reload the modified control word now...
36390+
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
36391+
NewCWFrameIdx);
36392+
36393+
// Do the addition.
36394+
if (MI.getOpcode() == X86::FP80_ADDr) {
36395+
BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80))
36396+
.add(MI.getOperand(0))
36397+
.add(MI.getOperand(1))
36398+
.add(MI.getOperand(2));
36399+
} else {
36400+
BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80m32))
36401+
.add(MI.getOperand(0))
36402+
.add(MI.getOperand(1))
36403+
.add(MI.getOperand(2))
36404+
.add(MI.getOperand(3))
36405+
.add(MI.getOperand(4))
36406+
.add(MI.getOperand(5))
36407+
.add(MI.getOperand(6));
36408+
}
36409+
36410+
// Reload the original control word now.
36411+
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
36412+
OrigCWFrameIdx);
36413+
36414+
MI.eraseFromParent(); // The pseudo instruction is gone now.
36415+
return BB;
36416+
}
36417+
3634336418
case X86::FP32_TO_INT16_IN_MEM:
3634436419
case X86::FP32_TO_INT32_IN_MEM:
3634536420
case X86::FP32_TO_INT64_IN_MEM:

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -732,6 +732,9 @@ namespace llvm {
732732
// User level interrupts - testui
733733
TESTUI,
734734

735+
// Perform an FP80 add after changing precision control in FPCW.
736+
FP80_ADD,
737+
735738
/// X86 strict FP compare instructions.
736739
STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
737740
STRICT_FCMPS,
@@ -771,6 +774,9 @@ namespace llvm {
771774
STRICT_CVTPS2PH,
772775
STRICT_CVTPH2PS,
773776

777+
// Perform an FP80 add after changing precision control in FPCW.
778+
STRICT_FP80_ADD,
779+
774780
// WARNING: Only add nodes here if they are strict FP nodes. Non-memory and
775781
// non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE.
776782

llvm/lib/Target/X86/X86InstrFPStack.td

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,13 @@ def SDTX86Fist : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
2626
def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
2727
def SDTX86CwdLoad : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
2828

29+
def X86fp80_add : SDNode<"X86ISD::FP80_ADD", SDTFPBinOp, [SDNPCommutative]>;
30+
def X86strict_fp80_add : SDNode<"X86ISD::STRICT_FP80_ADD", SDTFPBinOp,
31+
[SDNPHasChain,SDNPCommutative]>;
32+
def any_X86fp80_add : PatFrags<(ops node:$lhs, node:$rhs),
33+
[(X86strict_fp80_add node:$lhs, node:$rhs),
34+
(X86fp80_add node:$lhs, node:$rhs)]>;
35+
2936
def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld,
3037
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
3138
def X86fst : SDNode<"X86ISD::FST", SDTX86Fst,
@@ -141,6 +148,14 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [EFLAGS] in {
141148
[(X86fp_to_i32mem RFP80:$src, addr:$dst)]>;
142149
def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src),
143150
[(X86fp_to_i64mem RFP80:$src, addr:$dst)]>;
151+
152+
def FP80_ADDr : PseudoI<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
153+
[(set RFP80:$dst,
154+
(any_X86fp80_add RFP80:$src1, RFP80:$src2))]>;
155+
def FP80_ADDm32 : PseudoI<(outs RFP80:$dst), (ins RFP80:$src1, f32mem:$src2),
156+
[(set RFP80:$dst,
157+
(any_X86fp80_add RFP80:$src1,
158+
(f80 (extloadf32 addr:$src2))))]>;
144159
}
145160

146161
// All FP Stack operations are represented with four instructions here. The

llvm/test/CodeGen/X86/uint64-to-float.ll

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc < %s -mtriple=i686-apple-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
3-
; RUN: llc < %s -mtriple=x86_64-apple-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
2+
; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
3+
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
4+
; RUN: llc < %s -mtriple=i686-windows -mattr=+sse2 | FileCheck %s --check-prefix=X86-WIN
5+
; RUN: llc < %s -mtriple=x86_64-windows -mattr=+sse2 | FileCheck %s --check-prefix=X64-WIN
46

57
; Verify that we are using the efficient uitofp --> sitofp lowering illustrated
68
; by the compiler_rt implementation of __floatundisf.
@@ -42,6 +44,48 @@ define float @test(i64 %a) nounwind {
4244
; X64-NEXT: cvtsi2ss %rdi, %xmm0
4345
; X64-NEXT: addss %xmm0, %xmm0
4446
; X64-NEXT: retq
47+
;
48+
; X86-WIN-LABEL: test:
49+
; X86-WIN: # %bb.0: # %entry
50+
; X86-WIN-NEXT: pushl %ebp
51+
; X86-WIN-NEXT: movl %esp, %ebp
52+
; X86-WIN-NEXT: andl $-8, %esp
53+
; X86-WIN-NEXT: subl $24, %esp
54+
; X86-WIN-NEXT: movl 12(%ebp), %eax
55+
; X86-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
56+
; X86-WIN-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
57+
; X86-WIN-NEXT: shrl $31, %eax
58+
; X86-WIN-NEXT: fildll {{[0-9]+}}(%esp)
59+
; X86-WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
60+
; X86-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
61+
; X86-WIN-NEXT: orl $768, %ecx # imm = 0x300
62+
; X86-WIN-NEXT: movw %cx, {{[0-9]+}}(%esp)
63+
; X86-WIN-NEXT: fldcw {{[0-9]+}}(%esp)
64+
; X86-WIN-NEXT: fadds __real@5f80000000000000(,%eax,4)
65+
; X86-WIN-NEXT: fldcw {{[0-9]+}}(%esp)
66+
; X86-WIN-NEXT: fstps {{[0-9]+}}(%esp)
67+
; X86-WIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
68+
; X86-WIN-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
69+
; X86-WIN-NEXT: flds {{[0-9]+}}(%esp)
70+
; X86-WIN-NEXT: movl %ebp, %esp
71+
; X86-WIN-NEXT: popl %ebp
72+
; X86-WIN-NEXT: retl
73+
;
74+
; X64-WIN-LABEL: test:
75+
; X64-WIN: # %bb.0: # %entry
76+
; X64-WIN-NEXT: testq %rcx, %rcx
77+
; X64-WIN-NEXT: js .LBB0_1
78+
; X64-WIN-NEXT: # %bb.2: # %entry
79+
; X64-WIN-NEXT: cvtsi2ss %rcx, %xmm0
80+
; X64-WIN-NEXT: retq
81+
; X64-WIN-NEXT: .LBB0_1:
82+
; X64-WIN-NEXT: movq %rcx, %rax
83+
; X64-WIN-NEXT: shrq %rax
84+
; X64-WIN-NEXT: andl $1, %ecx
85+
; X64-WIN-NEXT: orq %rax, %rcx
86+
; X64-WIN-NEXT: cvtsi2ss %rcx, %xmm0
87+
; X64-WIN-NEXT: addss %xmm0, %xmm0
88+
; X64-WIN-NEXT: retq
4589
entry:
4690
%b = uitofp i64 %a to float
4791
ret float %b

0 commit comments

Comments
 (0)