Skip to content

File tree

4 files changed

+156
-0
lines changed

4 files changed

+156
-0
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -733,6 +733,96 @@ these intrinsics is a boolean flag, with the same functionality as described in
733733
For more information, refer PTX ISA
734734
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor>`_.
735735

736+
Warp Group Intrinsics
737+
---------------------
738+
739+
'``llvm.nvvm.wgmma.fence.sync.aligned``'
740+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
741+
742+
Syntax:
743+
"""""""
744+
745+
.. code-block:: llvm
746+
747+
declare void @llvm.nvvm.wgmma.fence.sync.aligned()
748+
749+
Overview:
750+
"""""""""
751+
752+
The '``@llvm.nvvm.wgmma.fence.sync.aligned``' intrinsic generates the
753+
``wgmma.fence.sync.aligned`` PTX instruction, which establishes an ordering
754+
between prior accesses to any warpgroup registers and subsequent accesses to
755+
the same registers by a ``wgmma.mma_async`` instruction.
756+
757+
The ``wgmma.fence`` instruction must be issued by all warps of the warpgroup in
758+
the following locations:
759+
760+
* Before the first ``wgmma.mma_async`` operation in a warpgroup.
761+
* Between a register access by a thread in the warpgroup and any
762+
``wgmma.mma_async`` instruction that accesses the same registers, except when
763+
these are accumulator register accesses across multiple ``wgmma.mma_async``
764+
instructions of the same shape in which case an ordering guarantee is
765+
provided by default.
766+
767+
For more information, refer PTX ISA
768+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-fence>`_.
769+
770+
'``llvm.nvvm.wgmma.commit_group.sync.aligned``'
771+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
772+
773+
Syntax:
774+
"""""""
775+
776+
.. code-block:: llvm
777+
778+
declare void @llvm.nvvm.wgmma.commit_group.sync.aligned()
779+
780+
Overview:
781+
"""""""""
782+
783+
The '``@llvm.nvvm.wgmma.commit_group.sync.aligned``' intrinsic generates the
784+
``wgmma.commit_group.sync.aligned`` PTX instruction, which creates a new
785+
wgmma-group per warpgroup and batches all prior ``wgmma.mma_async``
786+
instructions initiated by the executing warp but not committed to any
787+
wgmma-group into the new wgmma-group. If there are no uncommitted ``wgmma
788+
mma_async`` instructions then, ``wgmma.commit_group`` results in an empty
789+
wgmma-group.
790+
791+
An executing thread can wait for the completion of all ``wgmma.mma_async``
792+
operations in a wgmma-group by using ``wgmma.wait_group``.
793+
794+
For more information, refer PTX ISA
795+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-commit-group>`_.
796+
797+
'``llvm.nvvm.wgmma.wait_group.sync.aligned``'
798+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
799+
800+
Syntax:
801+
"""""""
802+
803+
.. code-block:: llvm
804+
805+
declare void @llvm.nvvm.wgmma.wait_group.sync.aligned(i64 immarg N)
806+
807+
Overview:
808+
"""""""""
809+
810+
The '``@llvm.nvvm.wgmma.wait_group.sync.aligned``' intrinsic generates the
811+
``wgmma.commit_group.sync.aligned N`` PTX instruction, which will cause the
812+
executing thread to wait until only ``N`` or fewer of the most recent
813+
wgmma-groups are pending and all the prior wgmma-groups committed by the
814+
executing threads are complete. For example, when ``N`` is 0, the executing
815+
thread waits on all the prior wgmma-groups to complete. Operand ``N`` is an
816+
integer constant.
817+
818+
Accessing the accumulator register or the input register containing the
819+
fragments of matrix A of a ``wgmma.mma_async`` instruction without first
820+
performing a ``wgmma.wait_group`` instruction that waits on a wgmma-group
821+
including that ``wgmma.mma_async`` instruction is undefined behavior.
822+
823+
For more information, refer PTX ISA
824+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-wait-group>`_.
825+
736826
Other Intrinsics
737827
----------------
738828

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4805,6 +4805,21 @@ def int_nvvm_redux_sync_or : ClangBuiltin<"__nvvm_redux_sync_or">,
48054805
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
48064806
[IntrConvergent, IntrInaccessibleMemOnly, IntrNoCallback]>;
48074807

4808+
//
4809+
// WGMMA fence instructions
4810+
//
4811+
// wgmma.fence.sync.aligned;
4812+
def int_nvvm_wgmma_fence_sync_aligned
4813+
: Intrinsic<[], [], [IntrConvergent]>;
4814+
4815+
// wgmma.commit_group.sync.aligned;
4816+
def int_nvvm_wgmma_commit_group_sync_aligned
4817+
: Intrinsic<[], [], [IntrConvergent], "llvm.nvvm.wgmma.commit_group.sync.aligned">;
4818+
4819+
// wgmma.wait_group.sync.aligned N;
4820+
def int_nvvm_wgmma_wait_group_sync_aligned
4821+
: Intrinsic<[], [llvm_i64_ty], [IntrConvergent, ImmArg<ArgIndex<0>>], "llvm.nvvm.wgmma.wait_group.sync.aligned">;
4822+
48084823
//
48094824
// WMMA instructions
48104825
//

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7484,4 +7484,18 @@ defm INT_SET_MAXNREG_DEC : SET_MAXNREG<"dec", int_nvvm_setmaxnreg_dec_sync_align
74847484

74857485
} // isConvergent
74867486

7487+
//
7488+
// WGMMA fence instructions
7489+
//
7490+
let isConvergent = true in {
7491+
def INT_NVVM_WGMMA_FENCE_SYNC_ALIGNED : NVPTXInst<(outs), (ins), "wgmma.fence.sync.aligned;",
7492+
[(int_nvvm_wgmma_fence_sync_aligned)]>, Requires<[hasSM90a, hasPTX<80>]>;
7493+
7494+
def INT_NVVM_WGMMA_COMMIT_GROUP_SYNC_ALIGNED : NVPTXInst<(outs), (ins), "wgmma.commit_group.sync.aligned;",
7495+
[(int_nvvm_wgmma_commit_group_sync_aligned)]>, Requires<[hasSM90a, hasPTX<80>]>;
7496+
7497+
def INT_NVVM_WGMMA_WAIT_GROUP_SYNC_ALIGNED : NVPTXInst<(outs), (ins i64imm:$n), "wgmma.wait_group.sync.aligned \t$n;",
7498+
[(int_nvvm_wgmma_wait_group_sync_aligned timm:$n)]>, Requires<[hasSM90a, hasPTX<80>]>;
7499+
} // isConvergent = true
7500+
74877501
def INT_EXIT : NVPTXInst<(outs), (ins), "exit;", [(int_nvvm_exit)]>;
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90a -mattr=+ptx80 | FileCheck %s
2+
; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_90a -mattr=+ptx80 | %ptxas-verify -arch=sm_90a %}
3+
4+
target triple = "nvptx64-nvidia-cuda"
5+
6+
declare void @llvm.nvvm.wgmma.fence.sync.aligned()
7+
8+
define void @test_wgmma_fence_sync_aligned() {
9+
; CHECK-LABEL: test_wgmma_fence_sync_aligned(
10+
; CHECK: // %bb.0:
11+
; CHECK-NEXT: wgmma.fence.sync.aligned;
12+
; CHECK-NEXT: ret;
13+
call void @llvm.nvvm.wgmma.fence.sync.aligned()
14+
ret void
15+
}
16+
17+
declare void @llvm.nvvm.wgmma.commit_group.sync.aligned()
18+
19+
define void @test_wgmma_commit_group_sync_aligned() {
20+
; CHECK-LABEL: test_wgmma_commit_group_sync_aligned(
21+
; CHECK: // %bb.0:
22+
; CHECK-NEXT: wgmma.commit_group.sync.aligned;
23+
; CHECK-NEXT: ret;
24+
call void @llvm.nvvm.wgmma.commit_group.sync.aligned()
25+
ret void
26+
}
27+
28+
declare void @llvm.nvvm.wgmma.wait_group.sync.aligned(i64)
29+
30+
define void @test_wgmma_wait_group_sync_aligned() {
31+
; CHECK-LABEL: test_wgmma_wait_group_sync_aligned(
32+
; CHECK: // %bb.0:
33+
; CHECK-NEXT: wgmma.wait_group.sync.aligned 10;
34+
; CHECK-NEXT: ret;
35+
call void @llvm.nvvm.wgmma.wait_group.sync.aligned(i64 10)
36+
ret void
37+
}

0 commit comments

Comments
 (0)