Skip to content

Commit 9a52ea5

Browse files
committed
Create a gpu.module operation for the GPU Dialect.
Summary: This is based on the use of code constantly checking for an attribute on a module and instead represents the distinct operation with a different op. Instead, this op can be used to provide better filtering. Reverts "Revert "[mlir] Create a gpu.module operation for the GPU Dialect."" This reverts commit ac44630 after fixing internal Google issues. This additionally updates ROCDL lowering to use the new gpu.module. Reviewers: herhut, mravishankar, antiagainst, nicolasvasilache Subscribers: jholewinski, mgorny, mehdi_amini, jpienaar, burmako, shauheen, csigg, arpith-jacob, mgester, lucyrfox, aartbik, liufengdb, llvm-commits, mravishankar, rriddle, antiagainst, bkramer Tags: #llvm Differential Revision: https://reviews.llvm.org/D72921
1 parent 87632b9 commit 9a52ea5

28 files changed

+250
-153
lines changed

mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,16 @@ namespace mlir {
1919
class Location;
2020
class ModuleOp;
2121

22+
template <typename T> class OpPassBase;
23+
24+
namespace gpu {
25+
class GPUModuleOp;
26+
} // namespace gpu
27+
2228
namespace LLVM {
2329
class LLVMDialect;
2430
} // namespace LLVM
2531

26-
template <typename T> class OpPassBase;
27-
2832
using OwnedCubin = std::unique_ptr<std::vector<char>>;
2933
using CubinGenerator =
3034
std::function<OwnedCubin(const std::string &, Location, StringRef)>;
@@ -38,7 +42,7 @@ using CubinGenerator =
3842
/// attached as a string attribute named 'nvvm.cubin' to the kernel function.
3943
/// After the transformation, the body of the kernel function is removed (i.e.,
4044
/// it is turned into a declaration).
41-
std::unique_ptr<OpPassBase<ModuleOp>>
45+
std::unique_ptr<OpPassBase<gpu::GPUModuleOp>>
4246
createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator);
4347

4448
/// Creates a pass to convert a gpu.launch_func operation into a sequence of

mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,18 @@ namespace mlir {
1414
class LLVMTypeConverter;
1515
class OwningRewritePatternList;
1616

17-
class ModuleOp;
1817
template <typename OpT> class OpPassBase;
1918

19+
namespace gpu {
20+
class GPUModuleOp;
21+
}
22+
2023
/// Collect a set of patterns to convert from the GPU dialect to NVVM.
2124
void populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
2225
OwningRewritePatternList &patterns);
2326

2427
/// Creates a pass that lowers GPU dialect operations to NVVM counterparts.
25-
std::unique_ptr<OpPassBase<ModuleOp>> createLowerGpuOpsToNVVMOpsPass();
28+
std::unique_ptr<OpPassBase<gpu::GPUModuleOp>> createLowerGpuOpsToNVVMOpsPass();
2629

2730
} // namespace mlir
2831

mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,13 @@
1212

1313
namespace mlir {
1414

15-
class ModuleOp;
15+
namespace gpu {
16+
class GPUModuleOp;
17+
} // namespace gpu
1618
template <typename OpT> class OpPassBase;
1719

1820
/// Creates a pass that lowers GPU dialect operations to ROCDL counterparts.
19-
std::unique_ptr<OpPassBase<ModuleOp>> createLowerGpuOpsToROCDLOpsPass();
21+
std::unique_ptr<OpPassBase<gpu::GPUModuleOp>> createLowerGpuOpsToROCDLOpsPass();
2022

2123
} // namespace mlir
2224

mlir/include/mlir/Dialect/GPU/GPUOps.td

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -598,4 +598,56 @@ def GPU_BarrierOp : GPU_Op<"barrier"> {
598598
let printer = [{ p << getOperationName(); }];
599599
}
600600

601+
def GPU_GPUModuleOp : GPU_Op<"module", [
602+
IsolatedFromAbove, SymbolTable, Symbol,
603+
SingleBlockImplicitTerminator<"ModuleEndOp">
604+
]> {
605+
let summary = "A top level compilation unit containing code to be run on a GPU.";
606+
let description = [{
607+
GPU module contains code that is intended to be run on a GPU. A host device
608+
can launch this code through a gpu.launch_func that creates a fully
609+
qualified symbol through the gpu.module's symbol and a gpu.func symbol
610+
contained in the gpu.module.
611+
612+
The module's top-level scope is modeled by a single region with a single
613+
block. GPU modules are required to have a name that is used for symbol
614+
resolution by the gpu.launch_func operation.
615+
616+
Using an op with a region to define a GPU module enables "embedding" GPU
617+
modules with SIMT execution models in other dialects in a clean manner and
618+
allows filtering of code regions to execute passes on only code intended to
619+
or not intended to be run on the separate device.
620+
621+
```
622+
gpu.module @symbol_name {
623+
gpu.func {}
624+
...
625+
gpu.module_end
626+
}
627+
628+
```
629+
}];
630+
let builders = [OpBuilder<"Builder *builder, OperationState &result, "
631+
"StringRef name">];
632+
let parser = [{ return ::parseGPUModuleOp(parser, result); }];
633+
let printer = [{ return ::print(p, *this); }];
634+
let regions = (region SizedRegion<1>:$body);
635+
636+
// We need to ensure the block inside the region is properly terminated;
637+
// the auto-generated builders do not guarantee that.
638+
let skipDefaultBuilders = 1;
639+
}
640+
641+
def GPU_ModuleEndOp : GPU_Op<"module_end", [
642+
Terminator, HasParent<"GPUModuleOp">
643+
]> {
644+
let summary = "A pseudo op that marks the end of a gpu.module.";
645+
let description = [{
646+
This op terminates the only block inside the only region of a `gpu.module`.
647+
}];
648+
649+
let parser = [{ return success(); }];
650+
let printer = [{ p << getOperationName(); }];
651+
}
652+
601653
#endif // GPU_OPS

mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -46,18 +46,15 @@ static constexpr const char *kCubinAnnotation = "nvvm.cubin";
4646
/// IR and further to PTX. A user provided CubinGenerator compiles the PTX to
4747
/// GPU binary code, which is then attached as an attribute to the function. The
4848
/// function body is erased.
49-
class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
49+
class GpuKernelToCubinPass
50+
: public OperationPass<GpuKernelToCubinPass, gpu::GPUModuleOp> {
5051
public:
5152
GpuKernelToCubinPass(
5253
CubinGenerator cubinGenerator = compilePtxToCubinForTesting)
5354
: cubinGenerator(cubinGenerator) {}
5455

55-
void runOnModule() override {
56-
ModuleOp module = getModule();
57-
if (!module.getAttrOfType<UnitAttr>(
58-
gpu::GPUDialect::getKernelModuleAttrName()) ||
59-
!module.getName())
60-
return;
56+
void runOnOperation() override {
57+
gpu::GPUModuleOp module = getOperation();
6158

6259
// Make sure the NVPTX target is initialized.
6360
LLVMInitializeNVPTXTarget();
@@ -71,8 +68,8 @@ class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
7168

7269
// Translate the module to CUBIN and attach the result as attribute to the
7370
// module.
74-
if (auto cubinAttr = translateGpuModuleToCubinAnnotation(
75-
*llvmModule, module.getLoc(), *module.getName()))
71+
if (auto cubinAttr = translateGPUModuleToCubinAnnotation(
72+
*llvmModule, module.getLoc(), module.getName()))
7673
module.setAttr(kCubinAnnotation, cubinAttr);
7774
else
7875
signalPassFailure();
@@ -92,7 +89,7 @@ class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
9289
StringRef name);
9390

9491
/// Translates llvmModule to cubin and returns the result as attribute.
95-
StringAttr translateGpuModuleToCubinAnnotation(llvm::Module &llvmModule,
92+
StringAttr translateGPUModuleToCubinAnnotation(llvm::Module &llvmModule,
9693
Location loc, StringRef name);
9794

9895
CubinGenerator cubinGenerator;
@@ -149,15 +146,15 @@ OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
149146
return cubinGenerator(ptx, loc, name);
150147
}
151148

152-
StringAttr GpuKernelToCubinPass::translateGpuModuleToCubinAnnotation(
149+
StringAttr GpuKernelToCubinPass::translateGPUModuleToCubinAnnotation(
153150
llvm::Module &llvmModule, Location loc, StringRef name) {
154151
auto cubin = convertModuleToCubin(llvmModule, loc, name);
155152
if (!cubin)
156153
return {};
157154
return StringAttr::get({cubin->data(), cubin->size()}, loc->getContext());
158155
}
159156

160-
std::unique_ptr<OpPassBase<ModuleOp>>
157+
std::unique_ptr<OpPassBase<gpu::GPUModuleOp>>
161158
mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) {
162159
return std::make_unique<GpuKernelToCubinPass>(cubinGenerator);
163160
}

mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,9 @@ class GpuLaunchFuncToCudaCallsPass
132132

133133
// GPU kernel modules are no longer necessary since we have a global
134134
// constant with the CUBIN data.
135-
for (auto m : llvm::make_early_inc_range(getModule().getOps<ModuleOp>()))
136-
if (m.getAttrOfType<UnitAttr>(gpu::GPUDialect::getKernelModuleAttrName()))
137-
m.erase();
135+
for (auto m :
136+
llvm::make_early_inc_range(getModule().getOps<gpu::GPUModuleOp>()))
137+
m.erase();
138138
}
139139

140140
private:
@@ -343,8 +343,8 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
343343
builder.getI32IntegerAttr(0));
344344
// Create an LLVM global with CUBIN extracted from the kernel annotation and
345345
// obtain a pointer to the first byte in it.
346-
auto kernelModule =
347-
getModule().lookupSymbol<ModuleOp>(launchOp.getKernelModuleName());
346+
auto kernelModule = getModule().lookupSymbol<gpu::GPUModuleOp>(
347+
launchOp.getKernelModuleName());
348348
assert(kernelModule && "expected a kernel module");
349349

350350
auto cubinAttr = kernelModule.getAttrOfType<StringAttr>(kCubinAnnotation);
@@ -354,8 +354,7 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
354354
return signalPassFailure();
355355
}
356356

357-
assert(kernelModule.getName() && "expected a named module");
358-
SmallString<128> nameBuffer(*kernelModule.getName());
357+
SmallString<128> nameBuffer(kernelModule.getName());
359358
nameBuffer.append(kCubinStorageSuffix);
360359
Value data = LLVM::createGlobalString(
361360
loc, builder, nameBuffer.str(), cubinAttr.getValue(),

mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ struct GPUAllReduceOpLowering : public LLVMOpLowering {
200200
auto type = operand.getType().cast<LLVM::LLVMType>();
201201

202202
// Create shared memory array to store the warp reduction.
203-
auto module = operand.getDefiningOp()->getParentOfType<ModuleOp>();
203+
auto module = operand.getDefiningOp()->getParentOfType<gpu::GPUModuleOp>();
204204
assert(module && "op must belong to a module");
205205
Value sharedMemPtr =
206206
createSharedMemoryArray(loc, module, type, kWarpSize, rewriter);
@@ -391,10 +391,10 @@ struct GPUAllReduceOpLowering : public LLVMOpLowering {
391391
}
392392

393393
/// Creates a global array stored in shared memory.
394-
Value createSharedMemoryArray(Location loc, ModuleOp module,
394+
Value createSharedMemoryArray(Location loc, gpu::GPUModuleOp module,
395395
LLVM::LLVMType elementType, int numElements,
396396
ConversionPatternRewriter &rewriter) const {
397-
OpBuilder builder(module.getBodyRegion());
397+
OpBuilder builder(module.body());
398398

399399
auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements);
400400
StringRef name = "reduce_buffer";
@@ -699,13 +699,11 @@ struct GPUReturnOpLowering : public LLVMOpLowering {
699699
///
700700
/// This pass only handles device code and is not meant to be run on GPU host
701701
/// code.
702-
class LowerGpuOpsToNVVMOpsPass : public ModulePass<LowerGpuOpsToNVVMOpsPass> {
702+
class LowerGpuOpsToNVVMOpsPass
703+
: public OperationPass<LowerGpuOpsToNVVMOpsPass, gpu::GPUModuleOp> {
703704
public:
704-
void runOnModule() override {
705-
ModuleOp m = getModule();
706-
if (!m.getAttrOfType<UnitAttr>(gpu::GPUDialect::getKernelModuleAttrName()))
707-
return;
708-
705+
void runOnOperation() override {
706+
gpu::GPUModuleOp m = getOperation();
709707
OwningRewritePatternList patterns;
710708
NVVMTypeConverter converter(m.getContext());
711709
populateStdToLLVMConversionPatterns(converter, patterns);
@@ -718,7 +716,7 @@ class LowerGpuOpsToNVVMOpsPass : public ModulePass<LowerGpuOpsToNVVMOpsPass> {
718716
target.addLegalDialect<LLVM::LLVMDialect>();
719717
target.addLegalDialect<NVVM::NVVMDialect>();
720718
// TODO(csigg): Remove once we support replacing non-root ops.
721-
target.addLegalOp<gpu::YieldOp>();
719+
target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
722720
if (failed(applyPartialConversion(m, target, patterns, &converter)))
723721
signalPassFailure();
724722
}
@@ -750,7 +748,8 @@ void mlir::populateGpuToNVVMConversionPatterns(
750748
"__nv_exp");
751749
}
752750

753-
std::unique_ptr<OpPassBase<ModuleOp>> mlir::createLowerGpuOpsToNVVMOpsPass() {
751+
std::unique_ptr<OpPassBase<gpu::GPUModuleOp>>
752+
mlir::createLowerGpuOpsToNVVMOpsPass() {
754753
return std::make_unique<LowerGpuOpsToNVVMOpsPass>();
755754
}
756755

mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,11 @@ namespace {
3131
//
3232
// This pass only handles device code and is not meant to be run on GPU host
3333
// code.
34-
class LowerGpuOpsToROCDLOpsPass : public ModulePass<LowerGpuOpsToROCDLOpsPass> {
34+
class LowerGpuOpsToROCDLOpsPass
35+
: public OperationPass<LowerGpuOpsToROCDLOpsPass, gpu::GPUModuleOp> {
3536
public:
36-
void runOnModule() override {
37-
ModuleOp m = getModule();
38-
if (!m.getAttrOfType<UnitAttr>(gpu::GPUDialect::getKernelModuleAttrName()))
39-
return;
37+
void runOnOperation() override {
38+
gpu::GPUModuleOp m = getOperation();
4039

4140
OwningRewritePatternList patterns;
4241
LLVMTypeConverter converter(m.getContext());
@@ -73,7 +72,8 @@ class LowerGpuOpsToROCDLOpsPass : public ModulePass<LowerGpuOpsToROCDLOpsPass> {
7372

7473
} // anonymous namespace
7574

76-
std::unique_ptr<OpPassBase<ModuleOp>> mlir::createLowerGpuOpsToROCDLOpsPass() {
75+
std::unique_ptr<OpPassBase<gpu::GPUModuleOp>>
76+
mlir::createLowerGpuOpsToROCDLOpsPass() {
7777
return std::make_unique<LowerGpuOpsToROCDLOpsPass>();
7878
}
7979

mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,15 @@
1+
set(LLVM_TARGET_DEFINITIONS GPUToSPIRV.td)
2+
mlir_tablegen(GPUToSPIRV.cpp.inc -gen-rewriters)
3+
add_public_tablegen_target(MLIRGPUToSPIRVIncGen)
4+
15
add_llvm_library(MLIRGPUtoSPIRVTransforms
26
ConvertGPUToSPIRV.cpp
37
ConvertGPUToSPIRVPass.cpp
48
)
59

10+
add_dependencies(MLIRGPUtoSPIRVTransforms
11+
MLIRGPUToSPIRVIncGen)
12+
613
target_link_libraries(MLIRGPUtoSPIRVTransforms
714
MLIRGPU
815
MLIRIR

0 commit comments

Comments
 (0)