Skip to content

Commit 44967fc

Browse files
AMDGPU: Simplify f16 to i64 custom lowering
The range that f16 can represent fits into i32. Lower as f16->i32->i64 instead of f16->f32->i64, since f32->i64 has a long expansion. Differential Revision: https://reviews.llvm.org/D84166
1 parent 3a34194 commit 44967fc

File tree

3 files changed

+46
-24
lines changed

3 files changed

+46
-24
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2702,14 +2702,12 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
27022702
// TODO: Factor out code common with LowerFP_TO_UINT.
27032703

27042704
EVT SrcVT = Src.getValueType();
2705-
if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2705+
if (SrcVT == MVT::f16 ||
2706+
(SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
27062707
SDLoc DL(Op);
27072708

2708-
SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2709-
SDValue FpToInt32 =
2710-
DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2711-
2712-
return FpToInt32;
2709+
SDValue FpToInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src);
2710+
return DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, FpToInt32);
27132711
}
27142712

27152713
if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
@@ -2725,14 +2723,12 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
27252723
// TODO: Factor out code common with LowerFP_TO_SINT.
27262724

27272725
EVT SrcVT = Src.getValueType();
2728-
if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2726+
if (SrcVT == MVT::f16 ||
2727+
(SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
27292728
SDLoc DL(Op);
27302729

2731-
SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2732-
SDValue FpToInt32 =
2733-
DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2734-
2735-
return FpToInt32;
2730+
SDValue FpToUInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src);
2731+
return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, FpToUInt32);
27362732
}
27372733

27382734
if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)

llvm/test/CodeGen/AMDGPU/fptosi.f16.ll

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,11 @@ entry:
3737
; test checks code generated for 'i64 = fp_to_sint f32'.
3838

3939
; GCN-LABEL: {{^}}fptosi_f16_to_i64
40-
; GCN: buffer_load_ushort
41-
; GCN: v_cvt_f32_f16_e32
40+
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
41+
; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
42+
; GCN: v_cvt_i32_f32_e32 v[[R_I64_Low:[0-9]+]], v[[A_F32]]
43+
; GCN: v_ashrrev_i32_e32 v[[R_I64_High:[0-9]+]], 31, v[[R_I64_Low]]
44+
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I64_Low]]{{\:}}[[R_I64_High]]{{\]}}
4245
; GCN: s_endpgm
4346
define amdgpu_kernel void @fptosi_f16_to_i64(
4447
i64 addrspace(1)* %r,
@@ -104,10 +107,21 @@ entry:
104107
; test checks code generated for 'i64 = fp_to_sint f32'.
105108

106109
; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i64
107-
; GCN: buffer_load_dword
108-
; GCN: v_cvt_f32_f16_e32
109-
; SI: v_cvt_f32_f16_e32
110-
; VI: v_cvt_f32_f16_sdwa
110+
; GCN: buffer_load_dword v[[A_F16_0:[0-9]+]]
111+
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]]
112+
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
113+
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
114+
; SI: v_cvt_i32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]]
115+
; SI: v_ashrrev_i32_e32 v[[R_I64_0_High:[0-9]+]], 31, v[[R_I64_0_Low]]
116+
; SI: v_cvt_i32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]]
117+
; SI: v_ashrrev_i32_e32 v[[R_I64_1_High:[0-9]+]], 31, v[[R_I64_1_Low]]
118+
; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
119+
; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
120+
; VI: v_cvt_i32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]]
121+
; VI: v_cvt_i32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]]
122+
; VI: v_ashrrev_i32_e32 v[[R_I64_1_High:[0-9]+]], 31, v[[R_I64_1_Low]]
123+
; VI: v_ashrrev_i32_e32 v[[R_I64_0_High:[0-9]+]], 31, v[[R_I64_0_Low]]
124+
; GCN: buffer_store_dwordx4 v{{\[}}[[R_I64_0_Low]]{{\:}}[[R_I64_1_High]]{{\]}}
111125
; GCN: s_endpgm
112126
define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
113127
<2 x i64> addrspace(1)* %r,

llvm/test/CodeGen/AMDGPU/fptoui.f16.ll

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,11 @@ entry:
3838
; test checks code generated for 'i64 = fp_to_uint f32'.
3939

4040
; GCN-LABEL: {{^}}fptoui_f16_to_i64
41-
; GCN: buffer_load_ushort
42-
; GCN: v_cvt_f32_f16_e32
41+
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
42+
; GCN: v_mov_b32_e32 v[[R_I64_High:[0-9]+]], 0
43+
; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
44+
; GCN: v_cvt_u32_f32_e32 v[[R_I64_Low:[0-9]+]], v[[A_F32]]
45+
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I64_Low]]{{\:}}[[R_I64_High]]{{\]}}
4346
; GCN: s_endpgm
4447
define amdgpu_kernel void @fptoui_f16_to_i64(
4548
i64 addrspace(1)* %r,
@@ -104,10 +107,19 @@ entry:
104107
; test checks code generated for 'i64 = fp_to_uint f32'.
105108

106109
; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i64
107-
; GCN: buffer_load_dword
108-
; GCN: v_cvt_f32_f16_e32
109-
; SI: v_cvt_f32_f16_e32
110-
; VI: v_cvt_f32_f16_sdwa
110+
; GCN: buffer_load_dword v[[A_F16_0:[0-9]+]]
111+
; GCN: v_mov_b32_e32 v[[R_I64_1_High:[0-9]+]], 0
112+
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]]
113+
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
114+
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
115+
; SI: v_cvt_u32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]]
116+
; SI: v_cvt_u32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]]
117+
; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
118+
; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
119+
; VI: v_cvt_u32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]]
120+
; VI: v_cvt_u32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]]
121+
; GCN: v_mov_b32_e32 v[[R_I64_0_High:[0-9]+]], 0
122+
; GCN: buffer_store_dwordx4 v{{\[}}[[R_I64_0_Low]]{{\:}}[[R_I64_1_High]]{{\]}}
111123
; GCN: s_endpgm
112124
define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
113125
<2 x i64> addrspace(1)* %r,

0 commit comments

Comments
 (0)