Skip to content

Commit aebd47a

Browse files
mbrkusanin, petar-avramovic, piotrAMD
committed
[AMDGPU] Add GFX12 WMMA and SWMMAC instructions
Co-authored-by: Petar Avramovic <[email protected]> Co-authored-by: Piotr Sobczak <[email protected]>
1 parent 4d46721 commit aebd47a

File tree

62 files changed

+17718
-93
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

62 files changed

+17718
-93
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,67 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
423423
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
424424
TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
425425

426+
//===----------------------------------------------------------------------===//
427+
// WMMA builtins.
428+
// Postfix w32 indicates the builtin requires wavefront size of 32.
429+
// Postfix w64 indicates the builtin requires wavefront size of 64.
430+
//
431+
// Some of these are very similar to their GFX11 counterparts, but they don't
432+
// require replication of the A,B matrices, so they use fewer vector elements.
433+
// Therefore, we add a "_gfx12" suffix to distinguish them from the existing
434+
// builtins.
435+
//===----------------------------------------------------------------------===//
436+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12, "V8fV8hV8hV8f", "nc", "gfx12-insts,wavefrontsize32")
437+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12, "V8fV8sV8sV8f", "nc", "gfx12-insts,wavefrontsize32")
438+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12, "V8hV8hV8hV8h", "nc", "gfx12-insts,wavefrontsize32")
439+
TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12, "V8sV8sV8sV8s", "nc", "gfx12-insts,wavefrontsize32")
440+
TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32")
441+
TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12, "V8iIbiIbiV8iIb", "nc", "gfx12-insts,wavefrontsize32")
442+
// These are gfx12-only, but for consistency with the other WMMA variants we're
443+
// keeping the "_gfx12" suffix.
444+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
445+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
446+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
447+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
448+
TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32")
449+
450+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12, "V4fV4hV4hV4f", "nc", "gfx12-insts,wavefrontsize64")
451+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12, "V4fV4sV4sV4f", "nc", "gfx12-insts,wavefrontsize64")
452+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12, "V4hV4hV4hV4h", "nc", "gfx12-insts,wavefrontsize64")
453+
TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12, "V4sV4sV4sV4s", "nc", "gfx12-insts,wavefrontsize64")
454+
TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
455+
TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
456+
// These are gfx12-only, but for consistency with the other WMMA variants we're
457+
// keeping the "_gfx12" suffix.
458+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
459+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
460+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
461+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
462+
TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
463+
464+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32, "V8fV8hV16hV8fs", "nc", "gfx12-insts,wavefrontsize32")
465+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32, "V8fV8sV16sV8fs", "nc", "gfx12-insts,wavefrontsize32")
466+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32, "V8hV8hV16hV8hs", "nc", "gfx12-insts,wavefrontsize32")
467+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32, "V8sV8sV16sV8ss", "nc", "gfx12-insts,wavefrontsize32")
468+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32")
469+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32, "V8iIbiIbV2iV8isIb", "nc", "gfx12-insts,wavefrontsize32")
470+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32")
471+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
472+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
473+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
474+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
475+
476+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64, "V4fV4hV8hV4fs", "nc", "gfx12-insts,wavefrontsize64")
477+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64, "V4fV4sV8sV4fs", "nc", "gfx12-insts,wavefrontsize64")
478+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64, "V4hV4hV8hV4hs", "nc", "gfx12-insts,wavefrontsize64")
479+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64, "V4sV4sV8sV4ss", "nc", "gfx12-insts,wavefrontsize64")
480+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64")
481+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64, "V4iIbiIbiV4isIb", "nc", "gfx12-insts,wavefrontsize64")
482+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64")
483+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
484+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
485+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
486+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
426487

427488
#undef BUILTIN
428489
#undef TARGET_BUILTIN

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 159 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18240,65 +18240,211 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1824018240
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
1824118241
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
1824218242
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
18243-
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: {
18243+
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
18244+
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
18245+
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
18246+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
18247+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
18248+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
18249+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
18250+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
18251+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
18252+
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
18253+
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
18254+
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
18255+
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
18256+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
18257+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
18258+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
18259+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
18260+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
18261+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
18262+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
18263+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
18264+
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
18265+
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
18266+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
18267+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
18268+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
18269+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
18270+
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
18271+
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
18272+
case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
18273+
case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
18274+
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
18275+
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
18276+
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
18277+
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
18278+
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
18279+
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
18280+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
18281+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
18282+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
18283+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
18284+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
18285+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
18286+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
18287+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: {
1824418288

1824518289
// These operations perform a matrix multiplication and accumulation of
1824618290
// the form:
1824718291
// D = A * B + C
18248-
// The return type always matches the type of matrix C.
18249-
unsigned ArgForMatchingRetType;
18292+
// We need to specify the overloaded operand types: one for matrices AB and one for CD (WMMA), or A, B, CD and the index operand (SWMMAC).
18293+
SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
18294+
// Some intrinsics expect "false" as an extra bool argument.
18295+
bool AppendExtraBoolArg = false;
1825018296
unsigned BuiltinWMMAOp;
1825118297

1825218298
switch (BuiltinID) {
1825318299
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
1825418300
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
18255-
ArgForMatchingRetType = 2;
18301+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
18302+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
18303+
ArgsForMatchingMatrixTypes = {0, 2};
1825618304
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
1825718305
break;
1825818306
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
1825918307
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
18260-
ArgForMatchingRetType = 2;
18308+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
18309+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
18310+
ArgsForMatchingMatrixTypes = {0, 2};
1826118311
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
1826218312
break;
18313+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
18314+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
18315+
AppendExtraBoolArg = true;
18316+
LLVM_FALLTHROUGH;
1826318317
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
1826418318
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
18265-
ArgForMatchingRetType = 2;
18319+
ArgsForMatchingMatrixTypes = {0, 2};
1826618320
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
1826718321
break;
18322+
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
18323+
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
18324+
AppendExtraBoolArg = true;
18325+
LLVM_FALLTHROUGH;
1826818326
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
1826918327
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
18270-
ArgForMatchingRetType = 2;
18328+
ArgsForMatchingMatrixTypes = {0, 2};
1827118329
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
1827218330
break;
1827318331
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
1827418332
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
18275-
ArgForMatchingRetType = 2;
18333+
ArgsForMatchingMatrixTypes = {0, 2};
1827618334
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
1827718335
break;
1827818336
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
1827918337
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
18280-
ArgForMatchingRetType = 2;
18338+
ArgsForMatchingMatrixTypes = {0, 2};
1828118339
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
1828218340
break;
1828318341
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
1828418342
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
18285-
ArgForMatchingRetType = 4;
18343+
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
18344+
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
18345+
ArgsForMatchingMatrixTypes = {1, 4};
1828618346
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
1828718347
break;
1828818348
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
1828918349
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
18290-
ArgForMatchingRetType = 4;
18350+
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
18351+
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
18352+
ArgsForMatchingMatrixTypes = {1, 4};
1829118353
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
1829218354
break;
18355+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
18356+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
18357+
ArgsForMatchingMatrixTypes = {0, 2};
18358+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
18359+
break;
18360+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
18361+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
18362+
ArgsForMatchingMatrixTypes = {0, 2};
18363+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
18364+
break;
18365+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
18366+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
18367+
ArgsForMatchingMatrixTypes = {0, 2};
18368+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
18369+
break;
18370+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
18371+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
18372+
ArgsForMatchingMatrixTypes = {0, 2};
18373+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
18374+
break;
18375+
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
18376+
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
18377+
ArgsForMatchingMatrixTypes = {1, 4};
18378+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
18379+
break;
18380+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
18381+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
18382+
ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
18383+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
18384+
break;
18385+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
18386+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
18387+
ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
18388+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
18389+
break;
18390+
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
18391+
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
18392+
ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
18393+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
18394+
break;
18395+
case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
18396+
case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
18397+
ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
18398+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
18399+
break;
18400+
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
18401+
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
18402+
ArgsForMatchingMatrixTypes = {1, 3, 4, 5};
18403+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
18404+
break;
18405+
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
18406+
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
18407+
ArgsForMatchingMatrixTypes = {1, 3, 4, 5};
18408+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
18409+
break;
18410+
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
18411+
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
18412+
ArgsForMatchingMatrixTypes = {1, 3, 4, 5};
18413+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
18414+
break;
18415+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
18416+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
18417+
ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
18418+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
18419+
break;
18420+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
18421+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
18422+
ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
18423+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
18424+
break;
18425+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
18426+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
18427+
ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
18428+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
18429+
break;
18430+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
18431+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
18432+
ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
18433+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
18434+
break;
1829318435
}
1829418436

1829518437
SmallVector<Value *, 6> Args;
1829618438
for (int i = 0, e = E->getNumArgs(); i != e; ++i)
1829718439
Args.push_back(EmitScalarExpr(E->getArg(i)));
18440+
if (AppendExtraBoolArg)
18441+
Args.push_back(Builder.getFalse());
1829818442

18299-
Function *F = CGM.getIntrinsic(BuiltinWMMAOp,
18300-
{Args[ArgForMatchingRetType]->getType()});
18443+
SmallVector<llvm::Type *, 6> ArgTypes;
18444+
for (auto ArgIdx : ArgsForMatchingMatrixTypes)
18445+
ArgTypes.push_back(Args[ArgIdx]->getType());
1830118446

18447+
Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes);
1830218448
return Builder.CreateCall(F, Args);
1830318449
}
1830418450

0 commit comments

Comments
 (0)