Skip to content

Commit 9a52ea5

Browse files
committed
Create a gpu.module operation for the GPU Dialect.
Summary: This is based on the use of code constantly checking for an attribute on a module and instead represents the distinct operation with a different op. Instead, this op can be used to provide better filtering. Reverts "Revert "[mlir] Create a gpu.module operation for the GPU Dialect."" This reverts commit ac44630 after fixing internal Google issues. This additionally updates ROCDL lowering to use the new gpu.module. Reviewers: herhut, mravishankar, antiagainst, nicolasvasilache Subscribers: jholewinski, mgorny, mehdi_amini, jpienaar, burmako, shauheen, csigg, arpith-jacob, mgester, lucyrfox, aartbik, liufengdb, llvm-commits, mravishankar, rriddle, antiagainst, bkramer Tags: #llvm Differential Revision: https://reviews.llvm.org/D72921
1 parent 87632b9 commit 9a52ea5

28 files changed

+250
-153
lines changed

mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,16 @@ namespace mlir {
1919
class Location;
2020
class ModuleOp;
2121

22+
template <typename T> class OpPassBase;
23+
24+
namespace gpu {
25+
class GPUModuleOp;
26+
} // namespace gpu
27+
2228
namespace LLVM {
2329
class LLVMDialect;
2430
} // namespace LLVM
2531

26-
template <typename T> class OpPassBase;
27-
2832
using OwnedCubin = std::unique_ptr<std::vector<char>>;
2933
using CubinGenerator =
3034
std::function<OwnedCubin(const std::string &, Location, StringRef)>;
@@ -38,7 +42,7 @@ using CubinGenerator =
3842
/// attached as a string attribute named 'nvvm.cubin' to the kernel function.
3943
/// After the transformation, the body of the kernel function is removed (i.e.,
4044
/// it is turned into a declaration).
41-
std::unique_ptr<OpPassBase<ModuleOp>>
45+
std::unique_ptr<OpPassBase<gpu::GPUModuleOp>>
4246
createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator);
4347

4448
/// Creates a pass to convert a gpu.launch_func operation into a sequence of

mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,18 @@ namespace mlir {
1414
class LLVMTypeConverter;
1515
class OwningRewritePatternList;
1616

17-
class ModuleOp;
1817
template <typename OpT> class OpPassBase;
1918

19+
namespace gpu {
20+
class GPUModuleOp;
21+
}
22+
2023
/// Collect a set of patterns to convert from the GPU dialect to NVVM.
2124
void populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
2225
OwningRewritePatternList &patterns);
2326

2427
/// Creates a pass that lowers GPU dialect operations to NVVM counterparts.
25-
std::unique_ptr<OpPassBase<ModuleOp>> createLowerGpuOpsToNVVMOpsPass();
28+
std::unique_ptr<OpPassBase<gpu::GPUModuleOp>> createLowerGpuOpsToNVVMOpsPass();
2629

2730
} // namespace mlir
2831

mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,13 @@
1212

1313
namespace mlir {
1414

15-
class ModuleOp;
15+
namespace gpu {
16+
class GPUModuleOp;
17+
} // namespace gpu
1618
template <typename OpT> class OpPassBase;
1719

1820
/// Creates a pass that lowers GPU dialect operations to ROCDL counterparts.
19-
std::unique_ptr<OpPassBase<ModuleOp>> createLowerGpuOpsToROCDLOpsPass();
21+
std::unique_ptr<OpPassBase<gpu::GPUModuleOp>> createLowerGpuOpsToROCDLOpsPass();
2022

2123
} // namespace mlir
2224

mlir/include/mlir/Dialect/GPU/GPUOps.td

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -598,4 +598,56 @@ def GPU_BarrierOp : GPU_Op<"barrier"> {
598598
let printer = [{ p << getOperationName(); }];
599599
}
600600

601+
def GPU_GPUModuleOp : GPU_Op<"module", [
602+
IsolatedFromAbove, SymbolTable, Symbol,
603+
SingleBlockImplicitTerminator<"ModuleEndOp">
604+
]> {
605+
let summary = "A top level compilation unit containing code to be run on a GPU.";
606+
let description = [{
607+
GPU module contains code that is intended to be run on a GPU. A host device
608+
can launch this code through a gpu.launch_func that creates a fully
609+
qualified symbol through the gpu.module's symbol and a gpu.func symbol
610+
contained in the gpu.module.
611+
612+
The module's top-level scope is modeled by a single region with a single
613+
block. GPU modules are required to have a name that is used for symbol
614+
resolution by the gpu.launch_func operation.
615+
616+
Using an op with a region to define a GPU module enables "embedding" GPU
617+
modules with SIMT execution models in other dialects in a clean manner and
618+
allows filtering of code regions to execute passes on only code intended to
619+
or not intended to be run on the separate device.
620+
621+
```
622+
gpu.module @symbol_name {
623+
gpu.func {}
624+
...
625+
gpu.module_end
626+
}
627+
628+
```
629+
}];
630+
let builders = [OpBuilder<"Builder *builder, OperationState &result, "
631+
"StringRef name">];
632+
let parser = [{ return ::parseGPUModuleOp(parser, result); }];
633+
let printer = [{ return ::print(p, *this); }];
634+
let regions = (region SizedRegion<1>:$body);
635+
636+
// We need to ensure the block inside the region is properly terminated;
637+
// the auto-generated builders do not guarantee that.
638+
let skipDefaultBuilders = 1;
639+
}
640+
641+
def GPU_ModuleEndOp : GPU_Op<"module_end", [
642+
Terminator, HasParent<"GPUModuleOp">
643+
]> {
644+
let summary = "A pseudo op that marks the end of a gpu.module.";
645+
let description = [{
646+
This op terminates the only block inside the only region of a `gpu.module`.
647+
}];
648+
649+
let parser = [{ return success(); }];
650+
let printer = [{ p << getOperationName(); }];
651+
}
652+
601653
#endif // GPU_OPS

mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -46,18 +46,15 @@ static constexpr const char *kCubinAnnotation = "nvvm.cubin";
4646
/// IR and further to PTX. A user provided CubinGenerator compiles the PTX to
4747
/// GPU binary code, which is then attached as an attribute to the function. The
4848
/// function body is erased.
49-
class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
49+
class GpuKernelToCubinPass
50+
: public OperationPass<GpuKernelToCubinPass, gpu::GPUModuleOp> {
5051
public:
5152
GpuKernelToCubinPass(
5253
CubinGenerator cubinGenerator = compilePtxToCubinForTesting)
5354
: cubinGenerator(cubinGenerator) {}
5455

55-
void runOnModule() override {
56-
ModuleOp module = getModule();
57-
if (!module.getAttrOfType<UnitAttr>(
58-
gpu::GPUDialect::getKernelModuleAttrName()) ||
59-
!module.getName())
60-
return;
56+
void runOnOperation() override {
57+
gpu::GPUModuleOp module = getOperation();
6158

6259
// Make sure the NVPTX target is initialized.
6360
LLVMInitializeNVPTXTarget();
@@ -71,8 +68,8 @@ class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
7168

7269
// Translate the module to CUBIN and attach the result as attribute to the
7370
// module.
74-
if (auto cubinAttr = translateGpuModuleToCubinAnnotation(
75-
*llvmModule, module.getLoc(), *module.getName()))
71+
if (auto cubinAttr = translateGPUModuleToCubinAnnotation(
72+
*llvmModule, module.getLoc(), module.getName()))
7673
module.setAttr(kCubinAnnotation, cubinAttr);
7774
else
7875
signalPassFailure();
@@ -92,7 +89,7 @@ class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
9289
StringRef name);
9390

9491
/// Translates llvmModule to cubin and returns the result as attribute.
95-
StringAttr translateGpuModuleToCubinAnnotation(llvm::Module &llvmModule,
92+
StringAttr translateGPUModuleToCubinAnnotation(llvm::Module &llvmModule,
9693
Location loc, StringRef name);
9794

9895
CubinGenerator cubinGenerator;
@@ -149,15 +146,15 @@ OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
149146
return cubinGenerator(ptx, loc, name);
150147
}
151148

152-
StringAttr GpuKernelToCubinPass::translateGpuModuleToCubinAnnotation(
149+
StringAttr GpuKernelToCubinPass::translateGPUModuleToCubinAnnotation(
153150
llvm::Module &llvmModule, Location loc, StringRef name) {
154151
auto cubin = convertModuleToCubin(llvmModule, loc, name);
155152
if (!cubin)
156153
return {};
157154
return StringAttr::get({cubin->data(), cubin->size()}, loc->getContext());
158155
}
159156

160-
std::unique_ptr<OpPassBase<ModuleOp>>
157+
std::unique_ptr<OpPassBase<gpu::GPUModuleOp>>
161158
mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) {
162159
return std::make_unique<GpuKernelToCubinPass>(cubinGenerator);
163160
}

mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,9 @@ class GpuLaunchFuncToCudaCallsPass
132132

133133
// GPU kernel modules are no longer necessary since we have a global
134134
// constant with the CUBIN data.
135-
for (auto m : llvm::make_early_inc_range(getModule().getOps<ModuleOp>()))
136-
if (m.getAttrOfType<UnitAttr>(gpu::GPUDialect::getKernelModuleAttrName()))
137-
m.erase();
135+
for (auto m :
136+
llvm::make_early_inc_range(getModule().getOps<gpu::GPUModuleOp>()))
137+
m.erase();
138138
}
139139

140140
private:
@@ -343,8 +343,8 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
343343
builder.getI32IntegerAttr(0));
344344
// Create an LLVM global with CUBIN extracted from the kernel annotation and
345345
// obtain a pointer to the first byte in it.
346-
auto kernelModule =
347-
getModule().lookupSymbol<ModuleOp>(launchOp.getKernelModuleName());
346+
auto kernelModule = getModule().lookupSymbol<gpu::GPUModuleOp>(
347+
launchOp.getKernelModuleName());
348348
assert(kernelModule && "expected a kernel module");
349349

350350
auto cubinAttr = kernelModule.getAttrOfType<StringAttr>(kCubinAnnotation);
@@ -354,8 +354,7 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
354354
return signalPassFailure();
355355
}
356356

357-
assert(kernelModule.getName() && "expected a named module");
358-
SmallString<128> nameBuffer(*kernelModule.getName());
357+
SmallString<128> nameBuffer(kernelModule.getName());
359358
nameBuffer.append(kCubinStorageSuffix);
360359
Value data = LLVM::createGlobalString(
361360
loc, builder, nameBuffer.str(), cubinAttr.getValue(),

mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ struct GPUAllReduceOpLowering : public LLVMOpLowering {
200200
auto type = operand.getType().cast<LLVM::LLVMType>();
201201

202202
// Create shared memory array to store the warp reduction.
203-
auto module = operand.getDefiningOp()->getParentOfType<ModuleOp>();
203+
auto module = operand.getDefiningOp()->getParentOfType<gpu::GPUModuleOp>();
204204
assert(module && "op must belong to a module");
205205
Value sharedMemPtr =
206206
createSharedMemoryArray(loc, module, type, kWarpSize, rewriter);
@@ -391,10 +391,10 @@ struct GPUAllReduceOpLowering : public LLVMOpLowering {
391391
}
392392

393393
/// Creates a global array stored in shared memory.
394-
Value createSharedMemoryArray(Location loc, ModuleOp module,
394+
Value createSharedMemoryArray(Location loc, gpu::GPUModuleOp module,
395395
LLVM::LLVMType elementType, int numElements,
396396
ConversionPatternRewriter &rewriter) const {
397-
OpBuilder builder(module.getBodyRegion());
397+
OpBuilder builder(module.body());
398398

399399
auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements);
400400
StringRef name = "reduce_buffer";
@@ -699,13 +699,11 @@ struct GPUReturnOpLowering : public LLVMOpLowering {
699699
///
700700
/// This pass only handles device code and is not meant to be run on GPU host
701701
/// code.
702-
class LowerGpuOpsToNVVMOpsPass : public ModulePass<LowerGpuOpsToNVVMOpsPass> {
702+
class LowerGpuOpsToNVVMOpsPass
703+
: public OperationPass<LowerGpuOpsToNVVMOpsPass, gpu::GPUModuleOp> {
703704
public:
704-
void runOnModule() override {
705-
ModuleOp m = getModule();
706-
if (!m.getAttrOfType<UnitAttr>(gpu::GPUDialect::getKernelModuleAttrName()))
707-
return;
708-
705+
void runOnOperation() override {
706+
gpu::GPUModuleOp m = getOperation();
709707
OwningRewritePatternList patterns;
710708
NVVMTypeConverter converter(m.getContext());
711709
populateStdToLLVMConversionPatterns(converter, patterns);
@@ -718,7 +716,7 @@ class LowerGpuOpsToNVVMOpsPass : public ModulePass<LowerGpuOpsToNVVMOpsPass> {
718716
target.addLegalDialect<LLVM::LLVMDialect>();
719717
target.addLegalDialect<NVVM::NVVMDialect>();
720718
// TODO(csigg): Remove once we support replacing non-root ops.
721-
target.addLegalOp<gpu::YieldOp>();
719+
target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
722720
if (failed(applyPartialConversion(m, target, patterns, &converter)))
723721
signalPassFailure();
724722
}
@@ -750,7 +748,8 @@ void mlir::populateGpuToNVVMConversionPatterns(
750748
"__nv_exp");
751749
}
752750

753-
std::unique_ptr<OpPassBase<ModuleOp>> mlir::createLowerGpuOpsToNVVMOpsPass() {
751+
std::unique_ptr<OpPassBase<gpu::GPUModuleOp>>
752+
mlir::createLowerGpuOpsToNVVMOpsPass() {
754753
return std::make_unique<LowerGpuOpsToNVVMOpsPass>();
755754
}
756755

mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,11 @@ namespace {
3131
//
3232
// This pass only handles device code and is not meant to be run on GPU host
3333
// code.
34-
class LowerGpuOpsToROCDLOpsPass : public ModulePass<LowerGpuOpsToROCDLOpsPass> {
34+
class LowerGpuOpsToROCDLOpsPass
35+
: public OperationPass<LowerGpuOpsToROCDLOpsPass, gpu::GPUModuleOp> {
3536
public:
36-
void runOnModule() override {
37-
ModuleOp m = getModule();
38-
if (!m.getAttrOfType<UnitAttr>(gpu::GPUDialect::getKernelModuleAttrName()))
39-
return;
37+
void runOnOperation() override {
38+
gpu::GPUModuleOp m = getOperation();
4039

4140
OwningRewritePatternList patterns;
4241
LLVMTypeConverter converter(m.getContext());
@@ -73,7 +72,8 @@ class LowerGpuOpsToROCDLOpsPass : public ModulePass<LowerGpuOpsToROCDLOpsPass> {
7372

7473
} // anonymous namespace
7574

76-
std::unique_ptr<OpPassBase<ModuleOp>> mlir::createLowerGpuOpsToROCDLOpsPass() {
75+
std::unique_ptr<OpPassBase<gpu::GPUModuleOp>>
76+
mlir::createLowerGpuOpsToROCDLOpsPass() {
7777
return std::make_unique<LowerGpuOpsToROCDLOpsPass>();
7878
}
7979

mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,15 @@
1+
set(LLVM_TARGET_DEFINITIONS GPUToSPIRV.td)
2+
mlir_tablegen(GPUToSPIRV.cpp.inc -gen-rewriters)
3+
add_public_tablegen_target(MLIRGPUToSPIRVIncGen)
4+
15
add_llvm_library(MLIRGPUtoSPIRVTransforms
26
ConvertGPUToSPIRV.cpp
37
ConvertGPUToSPIRVPass.cpp
48
)
59

10+
add_dependencies(MLIRGPUtoSPIRVTransforms
11+
MLIRGPUToSPIRVIncGen)
12+
613
target_link_libraries(MLIRGPUtoSPIRVTransforms
714
MLIRGPU
815
MLIRIR

0 commit comments

Comments
 (0)