[NVPTX] Add TMA bulk tensor copy intrinsics #96083
Conversation
@llvm/pr-subscribers-backend-nvptx @llvm/pr-subscribers-llvm-ir

Author: Durgadoss R (durga4github)

Changes

This patch adds NVVM intrinsics and NVPTX codeGen for:

* cp.async.bulk.tensor.S2G.1D -> 5D variants, supporting both Tile and Im2Col modes. These intrinsics optionally support cache_hints as indicated by the boolean flag argument.
* cp.async.bulk.tensor.G2S.1D -> 5D variants, with support for both Tile and Im2Col modes. The Im2Col variants have an extra set of offsets as parameters. These intrinsics optionally support multicast and cache_hints, as indicated by the boolean arguments at the end of the intrinsics.

TODO: Update documentation for these intrinsics in the NVPTX guide.

Patch is 74.33 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96083.diff

7 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 0a9139e0062ba..bd90d243b12f9 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -552,6 +552,22 @@ class SHFL_INFO<bit sync, string mode, string type, bit return_pred> {
[OpType, llvm_i32_ty, llvm_i32_ty]);
}
+class NVVM_INTRINSIC_RECORD<string intr> {
+ string record = !subst(".", "_", !subst("llvm.", "int_", intr));
+}
+
+class NVVM_CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_NAME<int dim> {
+ string intr = "llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem"
+ # "." # dim # "d";
+ string record = NVVM_INTRINSIC_RECORD<intr>.record;
+}
+
+class NVVM_CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_NAME<int dim> {
+ string intr = "llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem"
+ # "." # dim # "d";
+ string record = NVVM_INTRINSIC_RECORD<intr>.record;
+}
+
let TargetPrefix = "nvvm" in {
def int_nvvm_prmt : ClangBuiltin<"__nvvm_prmt">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
@@ -4822,4 +4838,42 @@ def int_nvvm_setmaxnreg_dec_sync_aligned_u32
def int_nvvm_exit : ClangBuiltin<"__nvvm_exit">,
Intrinsic<[], [], [IntrConvergent, IntrInaccessibleMemOnly, IntrNoReturn]>;
+// -------- llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem
+class NVVM_CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_INTR<int dim> :
+ DefaultAttrsIntrinsic<[],
+ !listconcat(
+ // flags, dst_smem_ptr, barrier_ptr, tensor_map_ptr
+ [llvm_i32_ty, llvm_shared_ptr_ty, llvm_shared_ptr_ty, llvm_ptr_ty],
+ !listsplat(llvm_i32_ty, dim), // tensor_dims
+ !if(!ge(dim, 3), !listsplat(llvm_i16_ty, !add(dim, -2)), []), // im2col
+ [llvm_i16_ty, llvm_i64_ty]), // cta_mask, cache_policy
+ [IntrConvergent, IntrArgMemOnly,
+ WriteOnly<ArgIndex<1>>, ReadOnly<ArgIndex<3>>,
+ NoCapture<ArgIndex<1>>, NoCapture<ArgIndex<2>>,
+ NoCapture<ArgIndex<3>>],
+ NVVM_CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_NAME<dim>.intr>;
+
+foreach dim = [1, 2, 3, 4, 5] in {
+ def NVVM_CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_NAME<dim>.record :
+ NVVM_CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_INTR<dim>;
+}
+
+// -------- llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem
+class NVVM_CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_INTR<int dim> :
+ DefaultAttrsIntrinsic<[],
+ !listconcat(
+ // flags, src_smem_ptr, tensor_map_ptr
+ [llvm_i32_ty, llvm_shared_ptr_ty, llvm_ptr_ty],
+ !listsplat(llvm_i32_ty, dim), // tensor_dims
+ [llvm_i64_ty]), // cache_policy
+ [IntrConvergent, IntrArgMemOnly,
+ ReadOnly<ArgIndex<1>>, WriteOnly<ArgIndex<2>>,
+ NoCapture<ArgIndex<1>>, NoCapture<ArgIndex<2>>],
+ NVVM_CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_NAME<dim>.intr>;
+
+foreach dim = [1, 2, 3, 4, 5] in {
+ def NVVM_CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_NAME<dim>.record :
+ NVVM_CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_INTR<dim>;
+}
+
} // let TargetPrefix = "nvvm"
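Note on the naming helpers above: for dim = 3, NVVM_CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_NAME<3>.intr expands to "llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.3d", and NVVM_INTRINSIC_RECORD rewrites that into the record name "int_nvvm_cp_async_bulk_tensor_gmem_to_smem_3d" — the name defined by the foreach loops and surfaced as Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_3d in the ISel code further down.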
diff --git a/llvm/include/llvm/IR/NVVMIntrinsicFlags.h b/llvm/include/llvm/IR/NVVMIntrinsicFlags.h
new file mode 100644
index 0000000000000..a8273b8de5adf
--- /dev/null
+++ b/llvm/include/llvm/IR/NVVMIntrinsicFlags.h
@@ -0,0 +1,40 @@
+//===--- NVVMIntrinsicFlags.h -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file contains the definitions of the enumerations and flags
+/// associated with NVVM Intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_NVVMINTRINSICFLAGS_H
+#define LLVM_SUPPORT_NVVMINTRINSICFLAGS_H
+
+#include <stdint.h>
+
+namespace llvm {
+namespace nvvm {
+
+enum class CpAsyncBulkTensorLoadMode {
+ TILE = 0,
+ IM2COL = 1,
+};
+
+typedef union {
+ int V;
+ struct {
+ unsigned CacheHint : 1;
+ unsigned MultiCast : 1;
+ unsigned LoadMode : 3; // CpAsyncBulkTensorLoadMode
+ unsigned reserved : 27;
+ } U;
+} CpAsyncBulkTensorFlags;
+
+} // namespace nvvm
+} // namespace llvm
+#endif // LLVM_SUPPORT_NVVMINTRINSICFLAGS_H
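As an aside, here is a minimal sketch (not part of the patch) of how a producer of these intrinsics might pack the i32 flags operand that the ISel code below decodes. The field layout is taken from the NVVMIntrinsicFlags.h hunk above; the helper name is hypothetical:

#include "llvm/IR/NVVMIntrinsicFlags.h"

// Pack the i32 flags operand for the cp.async.bulk.tensor intrinsics.
// Bit layout per NVVMIntrinsicFlags.h: CacheHint (1 bit), MultiCast (1 bit),
// LoadMode (3 bits, values from CpAsyncBulkTensorLoadMode), rest reserved.
static int makeCpAsyncBulkTensorFlags(bool CacheHint, bool MultiCast,
                                      llvm::nvvm::CpAsyncBulkTensorLoadMode Mode) {
  llvm::nvvm::CpAsyncBulkTensorFlags Flags;
  Flags.V = 0; // clear all bits, including the reserved field
  Flags.U.CacheHint = CacheHint ? 1 : 0;
  Flags.U.MultiCast = MultiCast ? 1 : 0;
  Flags.U.LoadMode = static_cast<unsigned>(Mode); // TILE = 0, IM2COL = 1
  return Flags.V;
}

// Example: an im2col load with a cache hint but no multicast.
// int F = makeCpAsyncBulkTensorFlags(/*CacheHint=*/true, /*MultiCast=*/false,
//                                    llvm::nvvm::CpAsyncBulkTensorLoadMode::IM2COL);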
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 1e1cbb15e33d4..cbb51de88acbe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -18,6 +18,7 @@
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
+#include "llvm/IR/NVVMIntrinsicFlags.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -160,6 +161,10 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
if (tryIntrinsicChain(N))
return;
break;
+ case ISD::INTRINSIC_VOID:
+ if (tryIntrinsicVoid(N))
+ return;
+ break;
case NVPTXISD::Tex1DFloatS32:
case NVPTXISD::Tex1DFloatFloat:
case NVPTXISD::Tex1DFloatFloatLevel:
@@ -3861,3 +3866,246 @@ unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
}
}
}
+
+static size_t GetCpAsyncBulkTensorDimFromIntrinsic(unsigned IID) {
+ switch (IID) {
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_1d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_1d:
+ return 1;
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_2d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_2d:
+ return 2;
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_3d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_3d:
+ return 3;
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_4d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_4d:
+ return 4;
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_5d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_5d:
+ return 5;
+ default:
+ llvm_unreachable(
+ "Invalid Tensor dim in nvvm_cp_async_bulk_tensor intrinsic");
+ }
+}
+
+#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, suffix) \
+ if (IsShared32) { \
+ return NVPTX:: \
+ CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix; \
+ } else { \
+ return NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix; \
+ }
+
+#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode) \
+ do { \
+ if (IsCacheHint) { \
+ CP_ASYNC_BULK_TENSOR_OPCODE(SMEM_TO_GMEM, dim, mode, _CH); \
+ } else { \
+ CP_ASYNC_BULK_TENSOR_OPCODE(SMEM_TO_GMEM, dim, mode, ); \
+ } \
+ } while (0)
+
+#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode) \
+ do { \
+ if (IsMultiCast && IsCacheHint) { \
+ CP_ASYNC_BULK_TENSOR_OPCODE(GMEM_TO_SMEM, dim, mode, _MC_CH); \
+ } else if (IsCacheHint) { \
+ CP_ASYNC_BULK_TENSOR_OPCODE(GMEM_TO_SMEM, dim, mode, _CH); \
+ } else if (IsMultiCast) { \
+ CP_ASYNC_BULK_TENSOR_OPCODE(GMEM_TO_SMEM, dim, mode, _MC); \
+ } else { \
+ CP_ASYNC_BULK_TENSOR_OPCODE(GMEM_TO_SMEM, dim, mode, ); \
+ } \
+ } while (0)
+
+static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32,
+ bool IsCacheHint, bool IsIm2Col) {
+ if (IsIm2Col) {
+ switch (Dim) {
+ case 3:
+ GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL);
+ case 4:
+ GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, IM2COL);
+ case 5:
+ GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, IM2COL);
+ default:
+ llvm_unreachable("Invalid Dimension in im2col mode for "
+ "GetCpAsyncBulkTensorS2GOpcode.");
+ }
+ } else {
+ switch (Dim) {
+ case 1:
+ GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(1D, TILE);
+ case 2:
+ GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(2D, TILE);
+ case 3:
+ GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, TILE);
+ case 4:
+ GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, TILE);
+ case 5:
+ GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, TILE);
+ default:
+ llvm_unreachable(
+ "Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode.");
+ }
+ }
+}
+
+static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
+ bool IsMultiCast,
+ bool IsCacheHint, bool IsIm2Col) {
+ if (IsIm2Col) {
+ switch (Dim) {
+ case 3:
+ GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL);
+ case 4:
+ GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL);
+ case 5:
+ GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL);
+ default:
+ llvm_unreachable("Invalid Dimension in im2col mode for "
+ "GetCpAsyncBulkTensorG2SOpcode.");
+ }
+ } else {
+ switch (Dim) {
+ case 1:
+ GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE);
+ case 2:
+ GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE);
+ case 3:
+ GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE);
+ case 4:
+ GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE);
+ case 5:
+ GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE);
+ default:
+ llvm_unreachable(
+ "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
+ }
+ }
+}
+
+void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2G(SDNode *N) {
+ unsigned int SharedPointerSize =
+ CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED);
+ bool IsShared32 = (SharedPointerSize == 32);
+
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ size_t NumDims = GetCpAsyncBulkTensorDimFromIntrinsic(IID);
+
+ ConstantSDNode *FlagsNode = cast<ConstantSDNode>(N->getOperand(2));
+ nvvm::CpAsyncBulkTensorFlags Flags;
+ Flags.V = static_cast<unsigned>(FlagsNode->getZExtValue());
+ bool IsCacheHint = Flags.U.CacheHint == 1;
+ bool IsIm2Col = Flags.U.LoadMode == 1;
+
+ SDLoc DL(N);
+ // List of operands that are common to both variants
+ SmallVector<SDValue, 4> Ops{
+ N->getOperand(3), // Src pointer in smem
+ N->getOperand(4), // Dst tensor_map pointer in gmem
+ };
+
+ // Tensor Dims from [1-5] followed by the cache-hint operand
+ size_t TensorDimsStartIndex = 5;
+ size_t CacheHintIndex = TensorDimsStartIndex + NumDims;
+ for (size_t i = 0; i < NumDims; i++)
+ Ops.push_back(N->getOperand(TensorDimsStartIndex + i));
+
+ // Push the cache-hint operand, if available
+ if (IsCacheHint)
+ Ops.push_back(N->getOperand(CacheHintIndex));
+
+ // Finally, the chain operand
+ Ops.push_back(N->getOperand(0));
+
+ unsigned Opcode =
+ GetCpAsyncBulkTensorS2GOpcode(NumDims, IsShared32, IsCacheHint, IsIm2Col);
+
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
+}
+
+void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2S(SDNode *N) {
+ unsigned int SharedPointerSize =
+ CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED);
+ bool IsShared32 = (SharedPointerSize == 32);
+
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ size_t NumDims = GetCpAsyncBulkTensorDimFromIntrinsic(IID);
+
+ ConstantSDNode *FlagsNode = cast<ConstantSDNode>(N->getOperand(2));
+ nvvm::CpAsyncBulkTensorFlags Flags;
+ Flags.V = static_cast<unsigned>(FlagsNode->getZExtValue());
+ bool IsCacheHint = Flags.U.CacheHint == 1;
+ bool IsMultiCast = Flags.U.MultiCast == 1;
+ bool IsIm2Col = Flags.U.LoadMode == 1;
+
+ if (IsIm2Col && NumDims < 3)
+ report_fatal_error("NumDims should be at least 3 for Im2Col mode");
+
+ SDLoc DL(N);
+ // List of operands that are common to both tile and im2col variants
+ SmallVector<SDValue, 4> Ops{
+ N->getOperand(3), // Dst pointer in smem
+ N->getOperand(4), // Mbarrier pointer in smem
+ N->getOperand(5), // Src pointer (i.e. tensor_map) in gmem
+ };
+
+ // Tensor Dims from [1-5]
+ size_t TensorDimsStartIndex = 6;
+ for (size_t i = 0; i < NumDims; i++)
+ Ops.push_back(N->getOperand(TensorDimsStartIndex + i));
+
+ // Im2Col co-ordinates:
+ // These are always present in the input arguments for TensorDims{3,4,5}.
+ // Number of values is (NumDims - 2).
+ size_t Im2ColStartIndex = TensorDimsStartIndex + NumDims;
+ size_t NumDimsIm2Col = (NumDims > 2) ? (NumDims - 2) : 0;
+ size_t Im2ColEndIndex = Im2ColStartIndex + NumDimsIm2Col;
+ // ...However, passed down to the actual NVPTX only when
+ // this mode is enabled.
+ if (IsIm2Col) {
+ for (size_t i = 0; i < NumDimsIm2Col; i++)
+ Ops.push_back(N->getOperand(Im2ColStartIndex + i));
+ }
+
+ // Push MultiCast operand, if available
+ if (IsMultiCast)
+ Ops.push_back(N->getOperand(Im2ColEndIndex));
+
+ // Push CacheHint operand, if available
+ if (IsCacheHint)
+ Ops.push_back(N->getOperand(Im2ColEndIndex + 1));
+
+ // Finally, the chain operand
+ Ops.push_back(N->getOperand(0));
+
+ unsigned Opcode = GetCpAsyncBulkTensorG2SOpcode(
+ NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col);
+
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
+}
+
+bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
+ unsigned IID = N->getConstantOperandVal(1);
+ switch (IID) {
+ default:
+ return false;
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_1d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_2d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_3d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_4d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_5d:
+ SelectCpAsyncBulkTensorS2G(N);
+ return true;
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_1d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_2d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_3d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_4d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_5d:
+ SelectCpAsyncBulkTensorG2S(N);
+ return true;
+ }
+}
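For reference, a small self-contained sketch (illustrative only, not part of the patch) that reproduces the operand-index arithmetic of SelectCpAsyncBulkTensorG2S above for a hypothetical 3D im2col copy. The SDNode operand layout it assumes is: 0 = chain, 1 = intrinsic ID, 2 = flags, 3 = dst smem pointer, 4 = mbarrier pointer, 5 = tensor_map pointer, followed by the tensor dimensions, im2col offsets, cta_mask and cache_policy:

#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t NumDims = 3;                                        // 3D im2col copy
  const std::size_t TensorDimsStartIndex = 6;                           // first tensor dim
  const std::size_t Im2ColStartIndex = TensorDimsStartIndex + NumDims;  // 9
  const std::size_t NumDimsIm2Col = (NumDims > 2) ? (NumDims - 2) : 0;  // 1
  const std::size_t Im2ColEndIndex = Im2ColStartIndex + NumDimsIm2Col;  // 10
  std::printf("tensor dims      : operands [%zu..%zu]\n", TensorDimsStartIndex,
              TensorDimsStartIndex + NumDims - 1);
  std::printf("im2col offsets   : operands [%zu..%zu]\n", Im2ColStartIndex,
              Im2ColEndIndex - 1);
  std::printf("cta_mask operand : %zu\n", Im2ColEndIndex);              // multicast
  std::printf("cache_policy     : %zu\n", Im2ColEndIndex + 1);          // cache hint
  return 0;
}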
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index c5524351f2ff9..267019807ad8b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -57,6 +57,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
void Select(SDNode *N) override;
bool tryIntrinsicNoChain(SDNode *N);
bool tryIntrinsicChain(SDNode *N);
+ bool tryIntrinsicVoid(SDNode *N);
void SelectTexSurfHandle(SDNode *N);
bool tryLoad(SDNode *N);
bool tryLoadVector(SDNode *N);
@@ -74,6 +75,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool SelectSETP_F16X2(SDNode *N);
bool SelectSETP_BF16X2(SDNode *N);
bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
+ void SelectCpAsyncBulkTensorS2G(SDNode *N);
+ void SelectCpAsyncBulkTensorG2S(SDNode *N);
inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index a65170e56aa24..97c5b2b9a7f1c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -433,6 +433,374 @@ def CP_ASYNC_BULK_WAIT_GROUP_READ :
[(int_nvvm_cp_async_bulk_wait_group_read (i32 timm:$n))]>,
Requires<[hasPTX<80>, hasSM<90>]>;
+//-----------------------------------
+// TMA Async Copy Functions
+//-----------------------------------
+
+// From Shared to Global memory
+multiclass CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_1D<NVPTXRegClass rc> {
+ def "": NVPTXInst<(outs),
+ (ins rc:$src, Int64Regs:$tmap, Int32Regs:$d0),
+ !strconcat(
+ "cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group",
+ " [$tmap, \\{$d0\\}], [$src];"), []>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+
+ def _CH: NVPTXInst<(outs),
+ (ins rc:$src, Int64Regs:$tmap, Int32Regs:$d0, Int64Regs:$cache_hint),
+ !strconcat(
+ "cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group",
+ ".L2::cache_hint",
+ " [$tmap, \\{$d0\\}], [$src], $cache_hint;"), []>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+}
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_1D_TILE : CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_1D<Int64Regs>;
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_1D_SHARED32_TILE : CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_1D<Int32Regs>;
+
+multiclass CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_2D<NVPTXRegClass rc> {
+ def "": NVPTXInst<(outs),
+ (ins rc:$src, Int64Regs:$tmap, Int32Regs:$d0, Int32Regs:$d1),
+ !strconcat(
+ "cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group",
+ " [$tmap, \\{$d0, $d1\\}], [$src];"), []>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+
+ def _CH: NVPTXInst<(outs),
+ (ins rc:$src, Int64Regs:$tmap, Int32Regs:$d0, Int32Regs:$d1, Int64Regs:$cache_hint),
+ !strconcat(
+ "cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group",
+ ".L2::cache_hint",
+ " [$tmap, \\{$d0, $d1\\}], [$src], $cache_hint;"), []>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+}
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_2D_TILE : CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_2D<Int64Regs>;
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_2D_SHARED32_TILE : CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_2D<Int32Regs>;
+
+multiclass CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_3D<NVPTXRegClass rc, string mode> {
+ def "": NVPTXInst<(outs),
+ (ins rc:$src, Int64Regs:$tmap, Int32Regs:$d0, Int32Regs:$d1, Int32Regs:$d2),
+ !strconcat(
+ "cp.async.bulk.tensor.3d.global.shared::cta", mode, ".bulk_group",
+ " [$tmap, \\{$d0, $d1, $d2\\}], [$src];"), []>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+ def _CH: NVPTXInst<(outs),
+ (ins rc:$src, Int64Regs:$tmap, Int32Regs:$d0, Int32Regs:$d1, Int32Regs:$d2, Int64Regs:$cache_hint),
+ !strconcat(
+ "cp.async.bulk.tensor.3d.global.shared::cta", mode, ".bulk_group",
+ ".L2::cache_hint",
+ " [$tmap, \\{$d0, $d1, $d2\\}], [$src], $cache_hint;"), []>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+}
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_3D_TILE : CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_3D<Int64Regs, ".tile">;
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_3D_SHARED32_TILE : CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_3D<Int32Regs, ".tile">;
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_3D_IM2COL : CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_3D<Int64Regs, ".im2col_no_offs">;
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_3D_SHARED32_IM2COL : CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_3D<Int32Regs, ".im2col_no_offs">;
+
+multiclass CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_4D<NVPTXRegClass rc, string mode> {
+ def "": NVPTXInst<(outs),
+ (ins rc:$src, Int64Regs:$tmap, Int32Regs:$d0, Int32Regs:$d1, Int32Regs:$d2, Int32Regs:$d3),
+ !strconcat(
+ "cp.async.bulk.tensor.4d.global.shared::cta", mode, ".bulk_group",
+ " [$tmap, \\{$d0, $d1, $d2, $d3\\}], [$src];"), []>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+ def _CH: NVPTXInst<(outs),
+ (ins rc:$src, Int64Regs:$tmap, Int32Regs:$d0, Int32Regs:$d1, Int32Regs:$d2, Int32Regs:$d3, Int64Regs:$cache_hint),
+ !strconcat(
+ "cp.async.bulk.tensor.4d.global.shared::cta", mode, ".bulk_group",
+ ".L2::cache_hint",
+ " [$tmap, \\{$d0, $...
[truncated]
Force-pushed from 864b6f4 to 0f0a8e3.
Force-pushed from 0f0a8e3 to ffb6151.
@Artem-B, could you please help with review?
Thank you Artem for looking at this! I will address the comments and refresh the patch.
Force-pushed from ffb6151 to bc6ec96.
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from d152681 to ec3e81a.
Force-pushed from ca9a0ad to f7813ce.
Fixed all clang-format issues.
@Artem-B, kindly help with review.
LGTM overall with a couple of minor nits.
Force-pushed from f7813ce to a953aa9.
LGTM, with a couple minor nits.
Force-pushed from 620fddf to c509b83.
This patch adds NVVM intrinsics and NVPTX codeGen for:

* cp.async.bulk.tensor.S2G.1D -> 5D variants, supporting both Tile and Im2Col modes. These intrinsics optionally support cache_hints as indicated by the boolean flag argument.
* cp.async.bulk.tensor.G2S.1D -> 5D variants, with support for both Tile and Im2Col modes. The Im2Col variants have an extra set of offsets as parameters. These intrinsics optionally support multicast and cache_hints, as indicated by the boolean arguments at the end of the intrinsics.
* The backend looks through these flag arguments and lowers to the appropriate PTX instruction.
* Lit tests are added for all combinations of these intrinsics in cp-async-bulk-tensor-g2s/s2g.ll.
* The generated PTX is verified with a 12.3 ptxas executable.
* Added docs for these intrinsics in NVPTXUsage.rst file.

Signed-off-by: Durgadoss R <[email protected]>
Force-pushed from c509b83 to 77c0deb.
PR #96083 added intrinsics for async copy of 'tensor' data using TMA. Following a similar design, this PR adds intrinsics for async copy of bulk data (non-tensor variants) through TMA.

* These intrinsics optionally support multicast and cache_hints, as indicated by the boolean arguments at the end of the intrinsics.
* The backend looks through these flag arguments and lowers to the appropriate PTX instructions.
* Lit tests are added for all combinations of these intrinsics in cp-async-bulk.ll.
* The generated PTX is verified with a 12.3 ptxas executable.
* Added docs for these intrinsics in NVPTXUsage.rst file.

PTX Spec reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk

Signed-off-by: Durgadoss R <[email protected]>
This patch adds NVVM intrinsics and NVPTX codeGen for:

* cp.async.bulk.tensor.S2G.1D -> 5D variants, supporting both Tile and Im2Col modes. These intrinsics optionally support cache_hints as indicated by the boolean flag argument.
* cp.async.bulk.tensor.G2S.1D -> 5D variants, with support for both Tile and Im2Col modes. The Im2Col variants have an extra set of offsets as parameters. These intrinsics optionally support multicast and cache_hints, as indicated by the boolean arguments at the end of the intrinsics.

PTX Spec reference:
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor
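For illustration, a minimal IRBuilder sketch of emitting a call to one of these intrinsics, using the 1D S2G signature as it appears in the IntrinsicsNVVM.td hunk above (i32 flags, shared-memory source pointer, tensor_map pointer, one i32 tensor dimension, i64 cache policy). The helper name is mine, and the sketch assumes an LLVM tree that already contains this patch; the final merged signatures may differ:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Hypothetical helper: emit a 1D shared-to-global bulk tensor copy.
// The flags operand must be an immediate so the ISel code can inspect it
// (it is read via cast<ConstantSDNode> in SelectCpAsyncBulkTensorS2G).
static void emitBulkTensorS2G1D(Module &M, IRBuilder<> &B, Value *SrcSmem,
                                Value *TensorMap, Value *Dim0,
                                Value *CachePolicy, unsigned Flags) {
  Function *Intr = Intrinsic::getDeclaration(
      &M, Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_1d);
  B.CreateCall(Intr,
               {B.getInt32(Flags), SrcSmem, TensorMap, Dim0, CachePolicy});
}

Whether the cache_policy operand is actually consumed depends on the CacheHint bit in the flags; as SelectCpAsyncBulkTensorS2G shows, the backend only forwards it to the PTX instruction when that bit is set.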