
Commit 4624a1e

tpopp authored and Stephan Herhut committed
[mlir] Create a gpu.module operation for the GPU Dialect.
Summary: Previously, code constantly had to check for an attribute on a module to identify GPU kernel modules. This change instead represents that distinct concept with its own op, which can then be used for better filtering.

Reviewers: herhut, mravishankar, antiagainst, rriddle

Reviewed By: herhut, antiagainst, rriddle

Subscribers: liufengdb, aartbik, jholewinski, mgorny, mehdi_amini, rriddle, jpienaar, burmako, shauheen, antiagainst, nicolasvasilache, csigg, arpith-jacob, mgester, lucyrfox, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D72336
1 parent 9492e9d commit 4624a1e
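
For orientation, and not part of the patch itself: the practical difference for pass code is roughly the following. The helper names are hypothetical; only APIs that existed in MLIR at the time are used.

// Hypothetical helpers, for illustration only: how device-side code was
// identified before this commit (a unit attribute on a plain module) versus
// after it (a dedicated gpu.module op).
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/IR/Module.h"
#include "llvm/Support/Casting.h"

bool isDeviceModuleBefore(mlir::ModuleOp module) {
  auto attr = module.getAttrOfType<mlir::UnitAttr>(
      mlir::gpu::GPUDialect::getKernelModuleAttrName());
  return static_cast<bool>(attr);
}

bool isDeviceModuleAfter(mlir::Operation *op) {
  return llvm::isa<mlir::gpu::GPUModuleOp>(op);
}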

File tree

24 files changed: +235 −140 lines


mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h

Lines changed: 8 additions & 3 deletions
@@ -19,12 +19,17 @@ namespace mlir {
 class Location;
 class ModuleOp;

+template <typename T>
+class OpPassBase;
+
+namespace gpu {
+class GPUModuleOp;
+} // namespace gpu
+
 namespace LLVM {
 class LLVMDialect;
 } // namespace LLVM

-template <typename T> class OpPassBase;
-
 using OwnedCubin = std::unique_ptr<std::vector<char>>;
 using CubinGenerator =
     std::function<OwnedCubin(const std::string &, Location, StringRef)>;
@@ -38,7 +43,7 @@ using CubinGenerator =
 /// attached as a string attribute named 'nvvm.cubin' to the kernel function.
 /// After the transformation, the body of the kernel function is removed (i.e.,
 /// it is turned into a declaration).
-std::unique_ptr<OpPassBase<ModuleOp>>
+std::unique_ptr<OpPassBase<gpu::GPUModuleOp>>
 createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator);

 /// Creates a pass to convert a gpu.launch_func operation into a sequence of

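A minimal usage sketch, not taken from the commit: CubinGenerator is a std::function, so a caller supplies a callback and receives a pass that is now anchored on gpu.module rather than on the top-level module. The generator below is a fake stand-in and its name is made up.

// Hypothetical sketch: supplying a CubinGenerator and creating the pass.
// A real generator would invoke ptxas or the CUDA driver; here the PTX bytes
// are copied through unchanged so the example stays self-contained.
#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Pass/Pass.h"

#include <memory>
#include <string>
#include <vector>

static mlir::OwnedCubin fakeCubinGenerator(const std::string &ptx,
                                           mlir::Location loc,
                                           llvm::StringRef name) {
  (void)loc;
  (void)name;
  return std::make_unique<std::vector<char>>(ptx.begin(), ptx.end());
}

std::unique_ptr<mlir::OpPassBase<mlir::gpu::GPUModuleOp>> makeCubinPass() {
  // The returned pass runs once per gpu.module, not on the whole ModuleOp.
  return mlir::createConvertGPUKernelToCubinPass(fakeCubinGenerator);
}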
mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h

Lines changed: 7 additions & 3 deletions
@@ -14,15 +14,19 @@ namespace mlir {
 class LLVMTypeConverter;
 class OwningRewritePatternList;

-class ModuleOp;
-template <typename OpT> class OpPassBase;
+template <typename OpT>
+class OpPassBase;
+
+namespace gpu {
+class GPUModuleOp;
+}

 /// Collect a set of patterns to convert from the GPU dialect to NVVM.
 void populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
                                          OwningRewritePatternList &patterns);

 /// Creates a pass that lowers GPU dialect operations to NVVM counterparts.
-std::unique_ptr<OpPassBase<ModuleOp>> createLowerGpuOpsToNVVMOpsPass();
+std::unique_ptr<OpPassBase<gpu::GPUModuleOp>> createLowerGpuOpsToNVVMOpsPass();

 } // namespace mlir


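Both factories now return passes over gpu::GPUModuleOp, so they get scheduled on the nested gpu.module ops instead of the surrounding module. A rough pipeline sketch follows; the nesting calls reflect the pass-manager API of that era and are not part of this patch.

// Hypothetical pipeline: lower each gpu.module to NVVM, then attach a CUBIN.
#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"

void buildGpuToCubinPipeline(mlir::PassManager &pm,
                             mlir::CubinGenerator cubinGenerator) {
  // nest<gpu::GPUModuleOp>() yields a pass manager that walks gpu.module ops.
  mlir::OpPassManager &gpuPM = pm.nest<mlir::gpu::GPUModuleOp>();
  gpuPM.addPass(mlir::createLowerGpuOpsToNVVMOpsPass());
  gpuPM.addPass(mlir::createConvertGPUKernelToCubinPass(cubinGenerator));
}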
mlir/include/mlir/Dialect/GPU/GPUOps.td

Lines changed: 52 additions & 0 deletions
@@ -588,4 +588,56 @@ def GPU_BarrierOp : GPU_Op<"barrier"> {
   let printer = [{ p << getOperationName(); }];
 }

+def GPU_GPUModuleOp : GPU_Op<"module", [
+  IsolatedFromAbove, SymbolTable, Symbol,
+  SingleBlockImplicitTerminator<"ModuleEndOp">
+]> {
+  let summary = "A top level compilation unit containing code to be run on a GPU.";
+  let description = [{
+    A GPU module contains code that is intended to be run on a GPU. A host
+    device can launch this code through a gpu.launch_func that creates a fully
+    qualified symbol through the gpu.module's symbol and a gpu.func symbol
+    contained in the gpu.module.
+
+    The module's top-level scope is modeled by a single region with a single
+    block. GPU modules are required to have a name that is used for symbol
+    resolution by the gpu.launch_func operation.
+
+    Using an op with a region to define a GPU module enables "embedding" GPU
+    modules with SIMT execution models in other dialects in a clean manner and
+    allows filtering of code regions to execute passes on only code intended to
+    or not intended to be run on the separate device.
+
+    ```
+    gpu.module @symbol_name {
+      gpu.func {}
+      ...
+      gpu.module_end
+    }
+
+    ```
+  }];
+  let builders = [OpBuilder<"Builder *builder, OperationState &result, "
+                            "StringRef name">];
+  let parser = [{ return ::parseGPUModuleOp(parser, result); }];
+  let printer = [{ return ::print(p, *this); }];
+  let regions = (region SizedRegion<1>:$body);
+
+  // We need to ensure the block inside the region is properly terminated;
+  // the auto-generated builders do not guarantee that.
+  let skipDefaultBuilders = 1;
+}
+
+def GPU_ModuleEndOp : GPU_Op<"module_end", [
+  Terminator, HasParent<"GPUModuleOp">
+]> {
+  let summary = "A pseudo op that marks the end of a gpu.module.";
+  let description = [{
+    This op terminates the only block inside the only region of a `gpu.module`.
+  }];
+
+  let parser = [{ return success(); }];
+  let printer = [{ p << getOperationName(); }];
+}
+
 #endif // GPU_OPS

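The custom builder, parser, and printer referenced above are implemented in the dialect's C++ sources, which are among the changed files but not reproduced in this excerpt. As a rough sketch of what the builder has to do, given skipDefaultBuilders and the implicit ModuleEndOp terminator, something along these lines would suffice; treat it as an illustration rather than the commit's exact code.

// Sketch of a GPUModuleOp builder: create the single region, make sure its
// block ends in gpu.module_end, and attach the symbol name. The real
// definition would live in the dialect's GPUDialect.cpp.
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/SymbolTable.h"

void mlir::gpu::GPUModuleOp::build(mlir::Builder *builder,
                                   mlir::OperationState &result,
                                   mlir::StringRef name) {
  // ensureTerminator comes from the SingleBlockImplicitTerminator trait.
  ensureTerminator(*result.addRegion(), *builder, result.location);
  // Register the symbol name used by gpu.launch_func for resolution.
  result.attributes.push_back(builder->getNamedAttr(
      mlir::SymbolTable::getSymbolAttrName(), builder->getStringAttr(name)));
}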
mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp

Lines changed: 9 additions & 12 deletions
@@ -46,18 +46,15 @@ static constexpr const char *kCubinAnnotation = "nvvm.cubin";
 /// IR and further to PTX. A user provided CubinGenerator compiles the PTX to
 /// GPU binary code, which is then attached as an attribute to the function. The
 /// function body is erased.
-class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
+class GpuKernelToCubinPass
+    : public OperationPass<GpuKernelToCubinPass, gpu::GPUModuleOp> {
 public:
   GpuKernelToCubinPass(
       CubinGenerator cubinGenerator = compilePtxToCubinForTesting)
       : cubinGenerator(cubinGenerator) {}

-  void runOnModule() override {
-    ModuleOp module = getModule();
-    if (!module.getAttrOfType<UnitAttr>(
-            gpu::GPUDialect::getKernelModuleAttrName()) ||
-        !module.getName())
-      return;
+  void runOnOperation() override {
+    gpu::GPUModuleOp module = getOperation();

     // Make sure the NVPTX target is initialized.
     LLVMInitializeNVPTXTarget();
@@ -71,8 +68,8 @@ class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {

     // Translate the module to CUBIN and attach the result as attribute to the
     // module.
-    if (auto cubinAttr = translateGpuModuleToCubinAnnotation(
-            *llvmModule, module.getLoc(), *module.getName()))
+    if (auto cubinAttr = translateGPUModuleToCubinAnnotation(
+            *llvmModule, module.getLoc(), module.getName()))
       module.setAttr(kCubinAnnotation, cubinAttr);
     else
       signalPassFailure();
@@ -92,7 +89,7 @@ class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
                               StringRef name);

   /// Translates llvmModule to cubin and returns the result as attribute.
-  StringAttr translateGpuModuleToCubinAnnotation(llvm::Module &llvmModule,
+  StringAttr translateGPUModuleToCubinAnnotation(llvm::Module &llvmModule,
                                                  Location loc, StringRef name);

   CubinGenerator cubinGenerator;
@@ -149,15 +146,15 @@ OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
   return cubinGenerator(ptx, loc, name);
 }

-StringAttr GpuKernelToCubinPass::translateGpuModuleToCubinAnnotation(
+StringAttr GpuKernelToCubinPass::translateGPUModuleToCubinAnnotation(
     llvm::Module &llvmModule, Location loc, StringRef name) {
   auto cubin = convertModuleToCubin(llvmModule, loc, name);
   if (!cubin)
     return {};
   return StringAttr::get({cubin->data(), cubin->size()}, loc->getContext());
 }

-std::unique_ptr<OpPassBase<ModuleOp>>
+std::unique_ptr<OpPassBase<gpu::GPUModuleOp>>
 mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) {
   return std::make_unique<GpuKernelToCubinPass>(cubinGenerator);
 }

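Switching from ModulePass to OperationPass<..., gpu::GPUModuleOp> means the pass body only ever sees device-side modules, so the attribute check and the optional-name handling disappear. A small illustrative pass using the same anchoring, not part of the commit:

// Hypothetical pass anchored on gpu.module, mirroring the
// OperationPass<DerivedT, gpu::GPUModuleOp> pattern adopted above.
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Pass/Pass.h"

namespace {
struct CountGpuFuncsPass
    : public mlir::OperationPass<CountGpuFuncsPass, mlir::gpu::GPUModuleOp> {
  void runOnOperation() override {
    unsigned numFuncs = 0;
    // No attribute or name check needed: getOperation() is already a
    // gpu.module, and its symbol name is mandatory.
    getOperation().walk([&](mlir::gpu::GPUFuncOp) { ++numFuncs; });
    getOperation().emitRemark() << "contains " << numFuncs << " gpu.func ops";
  }
};
} // namespace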
mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp

Lines changed: 6 additions & 7 deletions
@@ -132,9 +132,9 @@ class GpuLaunchFuncToCudaCallsPass

     // GPU kernel modules are no longer necessary since we have a global
     // constant with the CUBIN data.
-    for (auto m : llvm::make_early_inc_range(getModule().getOps<ModuleOp>()))
-      if (m.getAttrOfType<UnitAttr>(gpu::GPUDialect::getKernelModuleAttrName()))
-        m.erase();
+    for (auto m :
+         llvm::make_early_inc_range(getModule().getOps<gpu::GPUModuleOp>()))
+      m.erase();
   }

 private:
@@ -343,8 +343,8 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
                                  builder.getI32IntegerAttr(0));
   // Create an LLVM global with CUBIN extracted from the kernel annotation and
   // obtain a pointer to the first byte in it.
-  auto kernelModule =
-      getModule().lookupSymbol<ModuleOp>(launchOp.getKernelModuleName());
+  auto kernelModule = getModule().lookupSymbol<gpu::GPUModuleOp>(
+      launchOp.getKernelModuleName());
   assert(kernelModule && "expected a kernel module");

   auto cubinAttr = kernelModule.getAttrOfType<StringAttr>(kCubinAnnotation);
@@ -354,8 +354,7 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
     return signalPassFailure();
   }

-  assert(kernelModule.getName() && "expected a named module");
-  SmallString<128> nameBuffer(*kernelModule.getName());
+  SmallString<128> nameBuffer(kernelModule.getName());
   nameBuffer.append(kCubinStorageSuffix);
   Value data = LLVM::createGlobalString(
       loc, builder, nameBuffer.str(), cubinAttr.getValue(),

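The loop above relies on llvm::make_early_inc_range so that erasing the current gpu.module does not invalidate the iteration over getOps(). Shown in isolation, with a hypothetical helper name:

// Illustration of the erase-while-iterating idiom used above: the range
// adaptor advances the iterator past the current element before the loop
// body runs, so the erase is safe.
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/IR/Module.h"
#include "llvm/ADT/STLExtras.h"

void eraseAllGpuModules(mlir::ModuleOp module) {
  for (auto gpuModule :
       llvm::make_early_inc_range(module.getOps<mlir::gpu::GPUModuleOp>()))
    gpuModule.erase();
}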
mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp

Lines changed: 10 additions & 11 deletions
@@ -200,7 +200,7 @@ struct GPUAllReduceOpLowering : public LLVMOpLowering {
     auto type = operand.getType().cast<LLVM::LLVMType>();

     // Create shared memory array to store the warp reduction.
-    auto module = operand.getDefiningOp()->getParentOfType<ModuleOp>();
+    auto module = operand.getDefiningOp()->getParentOfType<gpu::GPUModuleOp>();
     assert(module && "op must belong to a module");
     Value sharedMemPtr =
         createSharedMemoryArray(loc, module, type, kWarpSize, rewriter);
@@ -391,10 +391,10 @@ struct GPUAllReduceOpLowering : public LLVMOpLowering {
   }

   /// Creates a global array stored in shared memory.
-  Value createSharedMemoryArray(Location loc, ModuleOp module,
+  Value createSharedMemoryArray(Location loc, gpu::GPUModuleOp module,
                                 LLVM::LLVMType elementType, int numElements,
                                 ConversionPatternRewriter &rewriter) const {
-    OpBuilder builder(module.getBodyRegion());
+    OpBuilder builder(module.body());

     auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements);
     StringRef name = "reduce_buffer";
@@ -699,13 +699,11 @@ struct GPUReturnOpLowering : public LLVMOpLowering {
 ///
 /// This pass only handles device code and is not meant to be run on GPU host
 /// code.
-class LowerGpuOpsToNVVMOpsPass : public ModulePass<LowerGpuOpsToNVVMOpsPass> {
+class LowerGpuOpsToNVVMOpsPass
+    : public OperationPass<LowerGpuOpsToNVVMOpsPass, gpu::GPUModuleOp> {
 public:
-  void runOnModule() override {
-    ModuleOp m = getModule();
-    if (!m.getAttrOfType<UnitAttr>(gpu::GPUDialect::getKernelModuleAttrName()))
-      return;
-
+  void runOnOperation() override {
+    gpu::GPUModuleOp m = getOperation();
     OwningRewritePatternList patterns;
     NVVMTypeConverter converter(m.getContext());
     populateStdToLLVMConversionPatterns(converter, patterns);
@@ -718,7 +716,7 @@ class LowerGpuOpsToNVVMOpsPass : public ModulePass<LowerGpuOpsToNVVMOpsPass> {
     target.addLegalDialect<LLVM::LLVMDialect>();
     target.addLegalDialect<NVVM::NVVMDialect>();
     // TODO(csigg): Remove once we support replacing non-root ops.
-    target.addLegalOp<gpu::YieldOp>();
+    target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
     if (failed(applyPartialConversion(m, target, patterns, &converter)))
       signalPassFailure();
   }
@@ -750,7 +748,8 @@ void mlir::populateGpuToNVVMConversionPatterns(
                                     "__nv_exp");
 }

-std::unique_ptr<OpPassBase<ModuleOp>> mlir::createLowerGpuOpsToNVVMOpsPass() {
+std::unique_ptr<OpPassBase<gpu::GPUModuleOp>>
+mlir::createLowerGpuOpsToNVVMOpsPass() {
   return std::make_unique<LowerGpuOpsToNVVMOpsPass>();
 }


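The conversion-target setup is the subtle part of the pass body: gpu.module and its gpu.module_end terminator must be declared legal because the pass rewrites their contents in place, and (per the TODO) replacing non-root ops is not supported yet. A sketch of that configuration in isolation, using only calls that appear above:

// Sketch of the legality configuration used by the NVVM lowering above.
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Transforms/DialectConversion.h"

void configureNVVMTarget(mlir::ConversionTarget &target) {
  target.addLegalDialect<mlir::LLVM::LLVMDialect>();
  target.addLegalDialect<mlir::NVVM::NVVMDialect>();
  // Keep gpu.yield legal (see the TODO above) and keep the enclosing
  // gpu.module/gpu.module_end legal so partial conversion leaves them alone.
  target.addLegalOp<mlir::gpu::YieldOp, mlir::gpu::GPUModuleOp,
                    mlir::gpu::ModuleEndOp>();
}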
mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
@@ -1,8 +1,15 @@
+set(LLVM_TARGET_DEFINITIONS GPUToSPIRV.td)
+mlir_tablegen(GPUToSPIRV.cpp.inc -gen-rewriters)
+add_public_tablegen_target(MLIRGPUToSPIRVIncGen)
+
 add_llvm_library(MLIRGPUtoSPIRVTransforms
   ConvertGPUToSPIRV.cpp
   ConvertGPUToSPIRVPass.cpp
   )

+add_dependencies(MLIRGPUtoSPIRVTransforms
+  MLIRGPUToSPIRVIncGen)
+
 target_link_libraries(MLIRGPUtoSPIRVTransforms
   MLIRGPU
   MLIRIR

mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp

Lines changed: 15 additions & 40 deletions
@@ -63,27 +63,13 @@ class KernelFnConversion final : public SPIRVOpLowering<gpu::GPUFuncOp> {
   SmallVector<int32_t, 3> workGroupSizeAsInt32;
 };

-/// Pattern to convert a module with gpu.kernel_module attribute to a
-/// spv.module.
-class KernelModuleConversion final : public SPIRVOpLowering<ModuleOp> {
+/// Pattern to convert a gpu.module to a spv.module.
+class GPUModuleConversion final : public SPIRVOpLowering<gpu::GPUModuleOp> {
 public:
-  using SPIRVOpLowering<ModuleOp>::SPIRVOpLowering;
+  using SPIRVOpLowering<gpu::GPUModuleOp>::SPIRVOpLowering;

   PatternMatchResult
-  matchAndRewrite(ModuleOp moduleOp, ArrayRef<Value> operands,
-                  ConversionPatternRewriter &rewriter) const override;
-};
-
-/// Pattern to convert a module terminator op to a terminator of spv.module op.
-// TODO: Move this into DRR, but that requires ModuleTerminatorOp to be defined
-// in ODS.
-class KernelModuleTerminatorConversion final
-    : public SPIRVOpLowering<ModuleTerminatorOp> {
-public:
-  using SPIRVOpLowering<ModuleTerminatorOp>::SPIRVOpLowering;
-
-  PatternMatchResult
-  matchAndRewrite(ModuleTerminatorOp terminatorOp, ArrayRef<Value> operands,
+  matchAndRewrite(gpu::GPUModuleOp moduleOp, ArrayRef<Value> operands,
                   ConversionPatternRewriter &rewriter) const override;
 };

@@ -284,16 +270,12 @@ KernelFnConversion::matchAndRewrite(gpu::GPUFuncOp funcOp,
 }

 //===----------------------------------------------------------------------===//
-// ModuleOp with gpu.kernel_module.
+// ModuleOp with gpu.module.
 //===----------------------------------------------------------------------===//

-PatternMatchResult KernelModuleConversion::matchAndRewrite(
-    ModuleOp moduleOp, ArrayRef<Value> operands,
+PatternMatchResult GPUModuleConversion::matchAndRewrite(
+    gpu::GPUModuleOp moduleOp, ArrayRef<Value> operands,
     ConversionPatternRewriter &rewriter) const {
-  if (!moduleOp.getAttrOfType<UnitAttr>(
-          gpu::GPUDialect::getKernelModuleAttrName())) {
-    return matchFailure();
-  }
   // TODO : Generalize this to account for different extensions,
   // capabilities, extended_instruction_sets, other addressing models
   // and memory models.
@@ -302,8 +284,8 @@ PatternMatchResult KernelModuleConversion::matchAndRewrite(
       spirv::MemoryModel::GLSL450, spirv::Capability::Shader,
       spirv::Extension::SPV_KHR_storage_buffer_storage_class);
   // Move the region from the module op into the SPIR-V module.
-  Region &spvModuleRegion = spvModule.getOperation()->getRegion(0);
-  rewriter.inlineRegionBefore(moduleOp.getBodyRegion(), spvModuleRegion,
+  Region &spvModuleRegion = spvModule.body();
+  rewriter.inlineRegionBefore(moduleOp.body(), spvModuleRegion,
                               spvModuleRegion.begin());
   // The spv.module build method adds a block with a terminator. Remove that
   // block. The terminator of the module op in the remaining block will be
@@ -313,17 +295,6 @@ PatternMatchResult KernelModuleConversion::matchAndRewrite(
   return matchSuccess();
 }

-//===----------------------------------------------------------------------===//
-// ModuleTerminatorOp for gpu.kernel_module.
-//===----------------------------------------------------------------------===//
-
-PatternMatchResult KernelModuleTerminatorConversion::matchAndRewrite(
-    ModuleTerminatorOp terminatorOp, ArrayRef<Value> operands,
-    ConversionPatternRewriter &rewriter) const {
-  rewriter.replaceOpWithNewOp<spirv::ModuleEndOp>(terminatorOp);
-  return matchSuccess();
-}
-
 //===----------------------------------------------------------------------===//
 // GPU return inside kernel functions to SPIR-V return.
 //===----------------------------------------------------------------------===//
@@ -342,14 +313,18 @@ PatternMatchResult GPUReturnOpConversion::matchAndRewrite(
 // GPU To SPIRV Patterns.
 //===----------------------------------------------------------------------===//

+namespace {
+#include "GPUToSPIRV.cpp.inc"
+}
+
 void mlir::populateGPUToSPIRVPatterns(MLIRContext *context,
                                       SPIRVTypeConverter &typeConverter,
                                       OwningRewritePatternList &patterns,
                                       ArrayRef<int64_t> workGroupSize) {
+  populateWithGenerated(context, &patterns);
   patterns.insert<KernelFnConversion>(context, typeConverter, workGroupSize);
   patterns.insert<
-      GPUReturnOpConversion, ForOpConversion, KernelModuleConversion,
-      KernelModuleTerminatorConversion,
+      GPUReturnOpConversion, ForOpConversion, GPUModuleConversion,
       LaunchConfigConversion<gpu::BlockDimOp, spirv::BuiltIn::WorkgroupSize>,
       LaunchConfigConversion<gpu::BlockIdOp, spirv::BuiltIn::WorkgroupId>,
       LaunchConfigConversion<gpu::GridDimOp, spirv::BuiltIn::NumWorkgroups>,

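A rough driver sketch for the patterns assembled above, showing how each gpu.module would be rewritten into a spv.module. The commit's actual driver lives in ConvertGPUToSPIRVPass.cpp (not shown in this excerpt); the function name, the default-constructed type converter, and the legality choices here are assumptions.

// Hypothetical driver for the GPU-to-SPIR-V patterns; illustration only.
#include "mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/SPIRV/SPIRVDialect.h"
#include "mlir/Dialect/SPIRV/SPIRVLowering.h"
#include "mlir/IR/Module.h"
#include "mlir/Transforms/DialectConversion.h"

mlir::LogicalResult
convertGpuModulesToSpirv(mlir::ModuleOp module,
                         llvm::ArrayRef<int64_t> workGroupSize) {
  mlir::MLIRContext *context = module.getContext();
  mlir::SPIRVTypeConverter typeConverter;
  mlir::OwningRewritePatternList patterns;
  mlir::populateGPUToSPIRVPatterns(context, typeConverter, patterns,
                                   workGroupSize);

  mlir::ConversionTarget target(*context);
  target.addLegalDialect<mlir::spirv::SPIRVDialect>();
  // gpu.module is rewritten into spv.module, so it must become illegal here.
  target.addIllegalOp<mlir::gpu::GPUModuleOp>();
  return mlir::applyPartialConversion(module, target, patterns, &typeConverter);
}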