
[NVPTX] Add Bulk Copy Prefetch Intrinsics #123226

Merged 4 commits on Jan 23, 2025
28 changes: 28 additions & 0 deletions llvm/docs/NVPTXUsage.rst
@@ -553,6 +553,34 @@ it must be a multiple of 16.
For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk>`_.

'``llvm.nvvm.cp.async.bulk.prefetch.L2``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

declare void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 %ch, i1 %flag_ch)

Overview:
"""""""""

The '``@llvm.nvvm.cp.async.bulk.prefetch.L2``' intrinsic
corresponds to the ``cp.async.bulk.prefetch.L2.*`` family
of PTX instructions. These instructions initiate an asynchronous
prefetch of bulk data from global memory to the L2 cache.
The 32-bit operand ``%size`` specifies the amount of memory to be
prefetched, in bytes, and must be a multiple of 16.

* The last argument to these intrinsics is a boolean flag indicating
  support for cache_hint. This flag argument must be a compile-time
  constant. When set, it indicates a valid cache_hint (``i64 %ch``)
  and generates the ``.L2::cache_hint`` variant of the PTX instruction.

For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch>`_.
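
For illustration, here is a minimal usage sketch mirroring the test added
in this PR (the wrapper function name ``@prefetch_example`` is a
placeholder): the first call sets the flag and therefore lowers to the
``.L2::cache_hint`` form, while the second passes a zero flag and lowers
to the plain form.

.. code-block:: llvm

  define void @prefetch_example(ptr addrspace(1) %src, i32 %size, i64 %ch) {
    ; flag = 1: %ch holds a valid cache hint, selects
    ; cp.async.bulk.prefetch.L2.global.L2::cache_hint
    call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 %ch, i1 1)
    ; flag = 0: the cache-hint operand is ignored, selects
    ; cp.async.bulk.prefetch.L2.global
    call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 0, i1 0)
    ret void
  }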

'``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

11 changes: 11 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -5033,4 +5033,15 @@ def int_nvvm_cp_async_bulk_shared_cta_to_global
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
ImmArg<ArgIndex<4>>]>;

// Intrinsics for Bulk Copy Prefetch L2
def int_nvvm_cp_async_bulk_prefetch_L2
: DefaultAttrsIntrinsic<[],
[llvm_global_ptr_ty, // src_gmem_ptr
llvm_i32_ty, // copy_size
llvm_i64_ty, // cache_hint
llvm_i1_ty], // Flag for cache_hint
[IntrConvergent, IntrArgMemOnly,
NoCapture<ArgIndex<0>>, ReadOnly<ArgIndex<0>>,
ImmArg<ArgIndex<3>>]>;

} // let TargetPrefix = "nvvm"
22 changes: 22 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -3105,6 +3105,25 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkG2S(SDNode *N) {
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
}

void NVPTXDAGToDAGISel::SelectCpAsyncBulkPrefetchL2(SDNode *N) {
// We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
// src, size, cache_hint, cache_hint_flag
// NumOperands = {Chain, IID} + {Actual intrinsic args}
// = {2} + {4}
size_t NumOps = N->getNumOperands();
bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
size_t NumArgs = IsCacheHint ? 3 : 2; // src, size, cache_hint

SDLoc DL(N);
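// Machine-node operands are {src, size} or {src, size, cache_hint},
// followed by the chain.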
SmallVector<SDValue, 4> Ops(N->ops().slice(2, NumArgs));
Ops.push_back(N->getOperand(0)); // Chain operand

unsigned Opcode = IsCacheHint
? NVPTX::CP_ASYNC_BULK_PREFETCH_CH
: NVPTX::CP_ASYNC_BULK_PREFETCH;
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
}

bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
unsigned IID = N->getConstantOperandVal(1);
using TMARedTy = llvm::nvvm::TMAReductionOp;
@@ -3118,6 +3137,9 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global:
SelectCpAsyncBulkS2G(N);
return true;
case Intrinsic::nvvm_cp_async_bulk_prefetch_L2:
SelectCpAsyncBulkPrefetchL2(N);
return true;
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:
1 change: 1 addition & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -92,6 +92,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
void SelectI128toV2I64(SDNode *N);
void SelectCpAsyncBulkG2S(SDNode *N);
void SelectCpAsyncBulkS2G(SDNode *N);
void SelectCpAsyncBulkPrefetchL2(SDNode *N);
void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);
void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false);
12 changes: 12 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -547,6 +547,18 @@ multiclass CP_ASYNC_BULK_CTA_TO_CLUSTER<NVPTXRegClass rc> {
defm CP_ASYNC_BULK_CTA_TO_CLUSTER : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int64Regs>;
defm CP_ASYNC_BULK_CTA_TO_CLUSTER_SHARED32 : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int32Regs>;

//------------------------------
// Bulk Copy Prefetch Functions
//------------------------------
def CP_ASYNC_BULK_PREFETCH : NVPTXInst<(outs),
(ins Int64Regs:$src, Int32Regs:$size),
"cp.async.bulk.prefetch.L2.global [$src], $size;", []>,
Requires<[hasPTX<80>, hasSM<90>]>;

def CP_ASYNC_BULK_PREFETCH_CH : NVPTXInst<(outs),
(ins Int64Regs:$src, Int32Regs:$size, Int64Regs:$ch),
"cp.async.bulk.prefetch.L2.global.L2::cache_hint [$src], $size, $ch;", []>,
Requires<[hasPTX<80>, hasSM<90>]>;
//-------------------------------------
// TMA Async Bulk Tensor Copy Functions
//-------------------------------------
19 changes: 19 additions & 0 deletions llvm/test/CodeGen/NVPTX/cp-async-bulk.ll
@@ -9,6 +9,7 @@ target triple = "nvptx64-nvidia-cuda"
declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(1), i32, i16, i64, i1, i1)
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1), ptr addrspace(3), i32, i64, i1)
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(3), i32)
declare void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1), i32, i64, i1)

define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr addrspace(3) %dst, i32 %size, i16 %mc, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_g2s(
@@ -116,3 +117,21 @@ define void @cp_async_bulk_cta_to_cluster(ptr addrspace(3) %src, ptr addrspace(3
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(3) %src, i32 %size)
ret void
}

define void @cp_async_bulk_prefetch(ptr addrspace(1) %src, i32 %size, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_prefetch(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b32 %r<2>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_prefetch_param_0];
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_prefetch_param_1];
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_prefetch_param_2];
; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.L2.global.L2::cache_hint [%rd1], %r1, %rd2;
; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.L2.global [%rd1], %r1;
; CHECK-PTX64-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 0, i1 0)
ret void
}