diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index b19632535b3e1..313e84f3722a9 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -733,6 +733,96 @@ these intrinsics is a boolean flag, with the same functionality as described in
 For more information, refer PTX ISA
 ``_.
 
+Warp Group Intrinsics
+---------------------
+
+'``llvm.nvvm.wgmma.fence.sync.aligned``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.wgmma.fence.sync.aligned()
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.wgmma.fence.sync.aligned``' intrinsic generates the
+``wgmma.fence.sync.aligned`` PTX instruction, which establishes an ordering
+between prior accesses to any warpgroup registers and subsequent accesses to
+the same registers by a ``wgmma.mma_async`` instruction.
+
+The ``wgmma.fence`` instruction must be issued by all warps of the warpgroup in
+the following locations:
+
+* Before the first ``wgmma.mma_async`` operation in a warpgroup.
+* Between a register access by a thread in the warpgroup and any
+  ``wgmma.mma_async`` instruction that accesses the same registers, except when
+  these are accumulator register accesses across multiple ``wgmma.mma_async``
+  instructions of the same shape, in which case an ordering guarantee is
+  provided by default.
+
+For more information, refer PTX ISA
+``_.
+
+'``llvm.nvvm.wgmma.commit_group.sync.aligned``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.wgmma.commit_group.sync.aligned()
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.wgmma.commit_group.sync.aligned``' intrinsic generates the
+``wgmma.commit_group.sync.aligned`` PTX instruction, which creates a new
+wgmma-group per warpgroup and batches all prior ``wgmma.mma_async``
+instructions initiated by the executing warp but not committed to any
+wgmma-group into the new wgmma-group. If there are no uncommitted
+``wgmma.mma_async`` instructions, then ``wgmma.commit_group`` results in an
+empty wgmma-group.
+
+An executing thread can wait for the completion of all ``wgmma.mma_async``
+operations in a wgmma-group by using ``wgmma.wait_group``.
+
+For more information, refer PTX ISA
+``_.
+
+'``llvm.nvvm.wgmma.wait_group.sync.aligned``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.wgmma.wait_group.sync.aligned(i64 immarg N)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.wgmma.wait_group.sync.aligned``' intrinsic generates the
+``wgmma.wait_group.sync.aligned N`` PTX instruction, which causes the
+executing thread to wait until only ``N`` or fewer of the most recent
+wgmma-groups are pending and all the prior wgmma-groups committed by the
+executing thread are complete. For example, when ``N`` is 0, the executing
+thread waits on all the prior wgmma-groups to complete. Operand ``N`` is an
+integer constant.
+
+Accessing the accumulator register or the input register containing the
+fragments of matrix A of a ``wgmma.mma_async`` instruction without first
+performing a ``wgmma.wait_group`` instruction that waits on a wgmma-group
+including that ``wgmma.mma_async`` instruction is undefined behavior.
+
+For more information, refer PTX ISA
+``_.
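+
+The following is a minimal illustrative sketch of how these three intrinsics
+are typically sequenced around the asynchronous warpgroup MMA operations they
+order. The function name ``@example_wgmma_sync`` is hypothetical, and the
+``wgmma.mma_async`` operations themselves are elided (indicated by a comment):
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.wgmma.fence.sync.aligned()
+  declare void @llvm.nvvm.wgmma.commit_group.sync.aligned()
+  declare void @llvm.nvvm.wgmma.wait_group.sync.aligned(i64)
+
+  define void @example_wgmma_sync() {
+    ; Order prior warpgroup register accesses before the first wgmma.mma_async.
+    call void @llvm.nvvm.wgmma.fence.sync.aligned()
+
+    ; ... wgmma.mma_async operations are issued here ...
+
+    ; Batch the outstanding wgmma.mma_async operations into a new wgmma-group.
+    call void @llvm.nvvm.wgmma.commit_group.sync.aligned()
+
+    ; N = 0: wait for all prior wgmma-groups to complete before their
+    ; accumulator registers are read.
+    call void @llvm.nvvm.wgmma.wait_group.sync.aligned(i64 0)
+    ret void
+  }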
+
 Other Intrinsics
 ----------------
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 9834dbb70d4c1..fd07d131ce15b 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -4805,6 +4805,21 @@ def int_nvvm_redux_sync_or : ClangBuiltin<"__nvvm_redux_sync_or">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
             [IntrConvergent, IntrInaccessibleMemOnly, IntrNoCallback]>;
 
+//
+// WGMMA fence instructions
+//
+// wgmma.fence.sync.aligned;
+def int_nvvm_wgmma_fence_sync_aligned
+  : Intrinsic<[], [], [IntrConvergent]>;
+
+// wgmma.commit_group.sync.aligned;
+def int_nvvm_wgmma_commit_group_sync_aligned
+  : Intrinsic<[], [], [IntrConvergent], "llvm.nvvm.wgmma.commit_group.sync.aligned">;
+
+// wgmma.wait_group.sync.aligned N;
+def int_nvvm_wgmma_wait_group_sync_aligned
+  : Intrinsic<[], [llvm_i64_ty], [IntrConvergent, ImmArg<ArgIndex<0>>], "llvm.nvvm.wgmma.wait_group.sync.aligned">;
+
 //
 // WMMA instructions
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 256161d5d79c7..33fc2922900c7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -7484,4 +7484,18 @@ defm INT_SET_MAXNREG_DEC : SET_MAXNREG<"dec", int_nvvm_setmaxnreg_dec_sync_align
 
 } // isConvergent
 
+//
+// WGMMA fence instructions
+//
+let isConvergent = true in {
+def INT_NVVM_WGMMA_FENCE_SYNC_ALIGNED : NVPTXInst<(outs), (ins), "wgmma.fence.sync.aligned;",
+  [(int_nvvm_wgmma_fence_sync_aligned)]>, Requires<[hasSM90a, hasPTX<80>]>;
+
+def INT_NVVM_WGMMA_COMMIT_GROUP_SYNC_ALIGNED : NVPTXInst<(outs), (ins), "wgmma.commit_group.sync.aligned;",
+  [(int_nvvm_wgmma_commit_group_sync_aligned)]>, Requires<[hasSM90a, hasPTX<80>]>;
+
+def INT_NVVM_WGMMA_WAIT_GROUP_SYNC_ALIGNED : NVPTXInst<(outs), (ins i64imm:$n), "wgmma.wait_group.sync.aligned \t$n;",
+  [(int_nvvm_wgmma_wait_group_sync_aligned timm:$n)]>, Requires<[hasSM90a, hasPTX<80>]>;
+} // isConvergent = true
+
 def INT_EXIT : NVPTXInst<(outs), (ins), "exit;", [(int_nvvm_exit)]>;
diff --git a/llvm/test/CodeGen/NVPTX/wgmma-sm90a-fence.ll b/llvm/test/CodeGen/NVPTX/wgmma-sm90a-fence.ll
new file mode 100644
index 0000000000000..59fe57b9b2c89
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/wgmma-sm90a-fence.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90a -mattr=+ptx80 | FileCheck %s
+; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_90a -mattr=+ptx80 | %ptxas-verify -arch=sm_90a %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.wgmma.fence.sync.aligned()
+
+define void @test_wgmma_fence_sync_aligned() {
+; CHECK-LABEL: test_wgmma_fence_sync_aligned(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: wgmma.fence.sync.aligned;
+; CHECK-NEXT: ret;
+  call void @llvm.nvvm.wgmma.fence.sync.aligned()
+  ret void
+}
+
+declare void @llvm.nvvm.wgmma.commit_group.sync.aligned()
+
+define void @test_wgmma_commit_group_sync_aligned() {
+; CHECK-LABEL: test_wgmma_commit_group_sync_aligned(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: wgmma.commit_group.sync.aligned;
+; CHECK-NEXT: ret;
+  call void @llvm.nvvm.wgmma.commit_group.sync.aligned()
+  ret void
+}
+
+declare void @llvm.nvvm.wgmma.wait_group.sync.aligned(i64)
+
+define void @test_wgmma_wait_group_sync_aligned() {
+; CHECK-LABEL: test_wgmma_wait_group_sync_aligned(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: wgmma.wait_group.sync.aligned 10;
+; CHECK-NEXT: ret;
+  call void @llvm.nvvm.wgmma.wait_group.sync.aligned(i64 10)
+  ret void
+}