diff --git a/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 new file mode 100644 index 0000000000000..3aa5d04246397 --- /dev/null +++ b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 @@ -0,0 +1,262 @@ +! RUN: %flang_fc1 -fopenmp -emit-llvm %s -o - | FileCheck %s + +! Combinational testing of control flow graph and builder insertion points +! in mlir-to-llvm conversion: +! - mixing multiple delayed privatizations and multiple reductions +! - multiple blocks in the private alloc region +! - private alloc region has to read from the mold variable +! - firstprivate +! - multiple blocks in the private copy region +! - multiple blocks in the reduction init region +! - reduction init region has to read from the mold variable +! - re-used omp.private ops +! - re-used omp.reduction.declare ops +! - unstructured code inside of the parallel region +! - needs private dealloc region, and this has multiple blocks +! - needs reduction cleanup region, and this has multiple blocks + +! This maybe belongs in the mlir tests, but what we are doing here is complex +! enough that I find the kind of minimised mlir code preferred by mlir reviewers +! hard to read without some fortran here for reference. Nothing like this would +! be generated by other upstream users of the MLIR OpenMP dialect. + +subroutine worst_case(a, b, c, d) + real, allocatable :: a(:), b(:), c(:), d(:) + integer i + + !$omp parallel firstprivate(a,b) reduction(+:c,d) + if (sum(a) == 1) stop 1 + !$omp end parallel +end subroutine + +! CHECK-LABEL: define internal void @worst_case_..omp_par +! CHECK-NEXT: omp.par.entry: +! [reduction alloc regions inlined here] +! CHECK: br label %omp.private.latealloc + +! CHECK: omp.private.latealloc: ; preds = %omp.par.entry +! CHECK-NEXT: br label %omp.private.alloc5 + +! CHECK: omp.private.alloc5: ; preds = %omp.private.latealloc +! [begin private alloc for first var] +! [read the length from the mold argument] +! [if it is non-zero...] +! CHECK: br i1 {{.*}}, label %omp.private.alloc6, label %omp.private.alloc7 + +! CHECK: omp.private.alloc7: ; preds = %omp.private.alloc5 +! [finish private alloc for first var with zero extent] +! CHECK: br label %omp.private.alloc8 + +! CHECK: omp.private.alloc8: ; preds = %omp.private.alloc6, %omp.private.alloc7 +! CHECK-NEXT: br label %omp.region.cont4 + +! CHECK: omp.region.cont4: ; preds = %omp.private.alloc8 +! CHECK-NEXT: %{{.*}} = phi ptr +! CHECK-NEXT: br label %omp.private.alloc + +! CHECK: omp.private.alloc: ; preds = %omp.region.cont4 +! [begin private alloc for first var] +! [read the length from the mold argument] +! [if it is non-zero...] +! CHECK: br i1 %{{.*}}, label %omp.private.alloc1, label %omp.private.alloc2 + +! CHECK: omp.private.alloc2: ; preds = %omp.private.alloc +! [finish private alloc for second var with zero extent] +! CHECK: br label %omp.private.alloc3 + +! CHECK: omp.private.alloc3: ; preds = %omp.private.alloc1, %omp.private.alloc2 +! CHECK-NEXT: br label %omp.region.cont + +! CHECK: omp.region.cont: ; preds = %omp.private.alloc3 +! CHECK-NEXT: %{{.*}} = phi ptr +! CHECK-NEXT: br label %omp.private.copy + +! CHECK: omp.private.copy: ; preds = %omp.region.cont +! CHECK-NEXT: br label %omp.private.copy10 + +! CHECK: omp.private.copy10: ; preds = %omp.private.copy +! [begin firstprivate copy for first var] +! [read the length, is it non-zero?] +! 
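The omp.private.alloc* blocks above are the inlined alloc regions of the omp.private ops; they can read the length from the mold argument because the original variable is mapped onto that argument before the region is inlined. A condensed sketch of the per-variable step, using the names from the convertOmpParallel changes later in this patch (error handling reduced to a bare failure return):

  // Condensed from the convertOmpParallel changes later in this patch.
  Region &allocRegion = privateDecls[i].getAllocRegion();
  // Make the mold (original) variable visible inside the alloc region so it
  // can read the descriptor/length from it ...
  llvm::Value *nonPrivateVar =
      moduleTranslation.lookupValue(opInst.getPrivateVars()[i]);
  moduleTranslation.mapValue(privateDecls[i].getAllocMoldArg(), nonPrivateVar);
  // ... then inline the (possibly multi-block) region; its blocks become the
  // omp.private.alloc* blocks above and the yielded pointer is the private copy.
  SmallVector<llvm::Value *> phis;
  if (failed(inlineConvertOmpRegions(allocRegion, "omp.private.alloc", builder,
                                     moduleTranslation, &phis)))
    return failure();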
CHECK: br i1 %{{.*}}, label %omp.private.copy11, label %omp.private.copy12 + +! CHECK: omp.private.copy12: ; preds = %omp.private.copy11, %omp.private.copy10 +! CHECK-NEXT: br label %omp.region.cont9 + +! CHECK: omp.region.cont9: ; preds = %omp.private.copy12 +! CHECK-NEXT: %{{.*}} = phi ptr +! CHECK-NEXT: br label %omp.private.copy14 + +! CHECK: omp.private.copy14: ; preds = %omp.region.cont9 +! [begin firstprivate copy for second var] +! [read the length, is it non-zero?] +! CHECK: br i1 %{{.*}}, label %omp.private.copy15, label %omp.private.copy16 + +! CHECK: omp.private.copy16: ; preds = %omp.private.copy15, %omp.private.copy14 +! CHECK-NEXT: br label %omp.region.cont13 + +! CHECK: omp.region.cont13: ; preds = %omp.private.copy16 +! CHECK-NEXT: %{{.*}} = phi ptr +! CHECK-NEXT: br label %omp.reduction.init + +! CHECK: omp.reduction.init: ; preds = %omp.region.cont13 +! [deffered stores for results of reduction alloc regions] +! CHECK: br label %[[VAL_96:.*]] + +! CHECK: omp.reduction.neutral: ; preds = %omp.reduction.init +! [start of reduction initialization region] +! [null check:] +! CHECK: br i1 %{{.*}}, label %omp.reduction.neutral18, label %omp.reduction.neutral19 + +! CHECK: omp.reduction.neutral19: ; preds = %omp.reduction.neutral +! [malloc and assign the default value to the reduction variable] +! CHECK: br label %omp.reduction.neutral20 + +! CHECK: omp.reduction.neutral20: ; preds = %omp.reduction.neutral18, %omp.reduction.neutral19 +! CHECK-NEXT: br label %omp.region.cont17 + +! CHECK: omp.region.cont17: ; preds = %omp.reduction.neutral20 +! CHECK-NEXT: %{{.*}} = phi ptr +! CHECK-NEXT: br label %omp.reduction.neutral22 + +! CHECK: omp.reduction.neutral22: ; preds = %omp.region.cont17 +! [start of reduction initialization region] +! [null check:] +! CHECK: br i1 %{{.*}}, label %omp.reduction.neutral23, label %omp.reduction.neutral24 + +! CHECK: omp.reduction.neutral24: ; preds = %omp.reduction.neutral22 +! [malloc and assign the default value to the reduction variable] +! CHECK: br label %omp.reduction.neutral25 + +! CHECK: omp.reduction.neutral25: ; preds = %omp.reduction.neutral23, %omp.reduction.neutral24 +! CHECK-NEXT: br label %omp.region.cont21 + +! CHECK: omp.region.cont21: ; preds = %omp.reduction.neutral25 +! CHECK-NEXT: %{{.*}} = phi ptr +! CHECK-NEXT: br label %omp.par.region + +! CHECK: omp.par.region: ; preds = %omp.region.cont21 +! CHECK-NEXT: br label %omp.par.region27 + +! CHECK: omp.par.region27: ; preds = %omp.par.region +! [call SUM runtime function] +! [if (sum(a) == 1)] +! CHECK: br i1 %{{.*}}, label %omp.par.region28, label %omp.par.region29 + +! CHECK: omp.par.region29: ; preds = %omp.par.region27 +! CHECK-NEXT: br label %omp.region.cont26 + +! CHECK: omp.region.cont26: ; preds = %omp.par.region28, %omp.par.region29 +! [omp parallel region done, call into the runtime to complete reduction] +! CHECK: %[[VAL_233:.*]] = call i32 @__kmpc_reduce( +! CHECK: switch i32 %[[VAL_233]], label %reduce.finalize [ +! CHECK-NEXT: i32 1, label %reduce.switch.nonatomic +! CHECK-NEXT: i32 2, label %reduce.switch.atomic +! CHECK-NEXT: ] + +! CHECK: reduce.switch.atomic: ; preds = %omp.region.cont26 +! CHECK-NEXT: unreachable + +! CHECK: reduce.switch.nonatomic: ; preds = %omp.region.cont26 +! CHECK-NEXT: %[[red_private_value_0:.*]] = load ptr, ptr %{{.*}}, align 8 +! CHECK-NEXT: br label %omp.reduction.nonatomic.body + +! [various blocks implementing the reduction] + +! CHECK: omp.region.cont35: ; preds = +! CHECK-NEXT: %{{.*}} = phi ptr +! 
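The omp.private.copy* blocks above are produced by inlining each firstprivate privatizer's copy region, whose two arguments are mapped to the original variable and to the private copy allocated just before it. A condensed sketch, again using the names from the convertOmpParallel changes later in this patch (error handling reduced to a bare failure return):

  // Condensed from the firstprivate handling added to convertOmpParallel.
  Region &copyRegion = privateDecls[i].getCopyRegion();
  // rhs argument: the original variable; lhs argument: the private copy that
  // was just allocated.
  moduleTranslation.mapValue(privateDecls[i].getCopyMoldArg(), nonPrivateVar);
  moduleTranslation.mapValue(privateDecls[i].getCopyPrivateArg(),
                             llvmPrivateVars[i]);
  if (failed(inlineConvertOmpRegions(copyRegion, "omp.private.copy", builder,
                                     moduleTranslation)))
    return failure();
  // Forget the per-use mapping so the same omp.private op can be reused for
  // the next firstprivate variable.
  moduleTranslation.forgetMapping(copyRegion);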
CHECK-NEXT: call void @__kmpc_end_reduce( +! CHECK-NEXT: br label %reduce.finalize + +! CHECK: reduce.finalize: ; preds = +! CHECK-NEXT: br label %omp.par.pre_finalize + +! CHECK: omp.par.pre_finalize: ; preds = %reduce.finalize +! CHECK-NEXT: %{{.*}} = load ptr, ptr +! CHECK-NEXT: br label %omp.reduction.cleanup + +! CHECK: omp.reduction.cleanup: ; preds = %omp.par.pre_finalize +! [null check] +! CHECK: br i1 %{{.*}}, label %omp.reduction.cleanup41, label %omp.reduction.cleanup42 + +! CHECK: omp.reduction.cleanup42: ; preds = %omp.reduction.cleanup41, %omp.reduction.cleanup +! CHECK-NEXT: br label %omp.region.cont40 + +! CHECK: omp.region.cont40: ; preds = %omp.reduction.cleanup42 +! CHECK-NEXT: %{{.*}} = load ptr, ptr +! CHECK-NEXT: br label %omp.reduction.cleanup44 + +! CHECK: omp.reduction.cleanup44: ; preds = %omp.region.cont40 +! [null check] +! CHECK: br i1 %{{.*}}, label %omp.reduction.cleanup45, label %omp.reduction.cleanup46 + +! CHECK: omp.reduction.cleanup46: ; preds = %omp.reduction.cleanup45, %omp.reduction.cleanup44 +! CHECK-NEXT: br label %omp.region.cont43 + +! CHECK: omp.region.cont43: ; preds = %omp.reduction.cleanup46 +! CHECK-NEXT: br label %omp.private.dealloc + +! CHECK: omp.private.dealloc: ; preds = %omp.region.cont43 +! [null check] +! CHECK: br i1 %{{.*}}, label %omp.private.dealloc48, label %omp.private.dealloc49 + +! CHECK: omp.private.dealloc49: ; preds = %omp.private.dealloc48, %omp.private.dealloc +! CHECK-NEXT: br label %omp.region.cont47 + +! CHECK: omp.region.cont47: ; preds = %omp.private.dealloc49 +! CHECK-NEXT: br label %omp.private.dealloc51 + +! CHECK: omp.private.dealloc51: ; preds = %omp.region.cont47 +! [null check] +! CHECK: br i1 %{{.*}}, label %omp.private.dealloc52, label %omp.private.dealloc53 + +! CHECK: omp.private.dealloc53: ; preds = %omp.private.dealloc52, %omp.private.dealloc51 +! CHECK-NEXT: br label %omp.region.cont50 + +! CHECK: omp.region.cont50: ; preds = %omp.private.dealloc53 +! CHECK-NEXT: br label %omp.par.outlined.exit.exitStub + +! CHECK: omp.private.dealloc52: ; preds = %omp.private.dealloc51 +! [dealloc memory] +! CHECK: br label %omp.private.dealloc53 + +! CHECK: omp.private.dealloc48: ; preds = %omp.private.dealloc +! [dealloc memory] +! CHECK: br label %omp.private.dealloc49 + +! CHECK: omp.reduction.cleanup45: ; preds = %omp.reduction.cleanup44 +! CHECK-NEXT: call void @free( +! CHECK-NEXT: br label %omp.reduction.cleanup46 + +! CHECK: omp.reduction.cleanup41: ; preds = %omp.reduction.cleanup +! CHECK-NEXT: call void @free( +! CHECK-NEXT: br label %omp.reduction.cleanup42 + +! CHECK: omp.par.region28: ; preds = %omp.par.region27 +! CHECK-NEXT: call {} @_FortranAStopStatement + +! CHECK: omp.reduction.neutral23: ; preds = %omp.reduction.neutral22 +! [source length was zero: finish initializing array] +! CHECK: br label %omp.reduction.neutral25 + +! CHECK: omp.reduction.neutral18: ; preds = %omp.reduction.neutral +! [source length was zero: finish initializing array] +! CHECK: br label %omp.reduction.neutral20 + +! CHECK: omp.private.copy15: ; preds = %omp.private.copy14 +! [source length was non-zero: call assign runtime] +! CHECK: br label %omp.private.copy16 + +! CHECK: omp.private.copy11: ; preds = %omp.private.copy10 +! [source length was non-zero: call assign runtime] +! CHECK: br label %omp.private.copy12 + +! CHECK: omp.private.alloc1: ; preds = %omp.private.alloc +! [var extent was non-zero: malloc a private array] +! CHECK: br label %omp.private.alloc3 + +! 
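The omp.reduction.cleanup* and omp.private.dealloc* blocks above come from inlining each declaration's cleanup or dealloc region after the parallel region body. For the privatizers this is driven by the following collection step, reproduced from the convertOmpParallel changes later in this patch (privateDecls as collected there):

  // One dealloc region is collected per privatizer; these are inlined after
  // the region body and become the omp.private.dealloc* blocks checked above.
  SmallVector<Region *> privateCleanupRegions;
  llvm::transform(privateDecls, std::back_inserter(privateCleanupRegions),
                  [](omp::PrivateClauseOp privatizer) {
                    return &privatizer.getDeallocRegion();
                  });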
CHECK: omp.private.alloc6: ; preds = %omp.private.alloc5 +! [var extent was non-zero: malloc a private array] +! CHECK: br label %omp.private.alloc8 + +! CHECK: omp.par.outlined.exit.exitStub: ; preds = %omp.region.cont50 +! CHECK-NEXT: ret void diff --git a/flang/test/Integration/OpenMP/private-global.f90 b/flang/test/Integration/OpenMP/private-global.f90 new file mode 100644 index 0000000000000..62d0a3faf0c59 --- /dev/null +++ b/flang/test/Integration/OpenMP/private-global.f90 @@ -0,0 +1,46 @@ +!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s + +! Regression test for https://github.com/llvm/llvm-project/issues/106297 + +program bug + implicit none + integer :: table(10) + !$OMP PARALLEL PRIVATE(table) + table = 50 + if (any(table/=50)) then + stop 'fail 3' + end if + !$OMP END PARALLEL + print *,'ok' +End Program + + +! CHECK-LABEL: define internal void {{.*}}..omp_par( +! CHECK: omp.par.entry: +! CHECK: %[[VAL_9:.*]] = alloca i32, align 4 +! CHECK: %[[VAL_10:.*]] = load i32, ptr %[[VAL_11:.*]], align 4 +! CHECK: store i32 %[[VAL_10]], ptr %[[VAL_9]], align 4 +! CHECK: %[[VAL_12:.*]] = load i32, ptr %[[VAL_9]], align 4 +! CHECK: %[[PRIV_TABLE:.*]] = alloca [10 x i32], i64 1, align 4 +! ... +! check that we use the private copy of table for the assignment +! CHECK: omp.par.region1: +! CHECK: %[[ELEMENTAL_TMP:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8 +! CHECK: %[[TABLE_BOX_ADDR:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8 +! CHECK: %[[BOXED_FIFTY:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 +! CHECK: %[[TABLE_BOX_ADDR2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 +! CHECK: %[[TABLE_BOX_VAL:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]] {{\[\[}}3 x i64] [i64 1, i64 10, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64)]] }, ptr %[[PRIV_TABLE]], 0 +! CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL]], ptr %[[TABLE_BOX_ADDR]], align 8 +! CHECK: %[[TABLE_BOX_VAL2:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[TABLE_BOX_ADDR]], align 8 +! CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL2]], ptr %[[TABLE_BOX_ADDR2]], align 8 +! CHECK: %[[VAL_26:.*]] = call {} @_FortranAAssign(ptr %[[TABLE_BOX_ADDR2]], ptr %[[BOXED_FIFTY]], ptr @{{.*}}, i32 9) +! ... +! check that we use the private copy of table for table/=50 +! CHECK: omp.par.region3: +! CHECK: %[[VAL_44:.*]] = sub nsw i64 %{{.*}}, 1 +! CHECK: %[[VAL_45:.*]] = mul nsw i64 %[[VAL_44]], 1 +! CHECK: %[[VAL_46:.*]] = mul nsw i64 %[[VAL_45]], 1 +! CHECK: %[[VAL_47:.*]] = add nsw i64 %[[VAL_46]], 0 +! CHECK: %[[VAL_48:.*]] = getelementptr i32, ptr %[[PRIV_TABLE]], i64 %[[VAL_47]] +! CHECK: %[[VAL_49:.*]] = load i32, ptr %[[VAL_48]], align 4 +! 
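The regression tracked by this test was that the parallel region body could keep using the original `table` rather than its private copy. With privatization handled in the body-generation callback, the region's block argument is mapped directly to the pointer yielded by the inlined alloc region, so the Assign call and the `table/=50` indexing above both land on %[[PRIV_TABLE]]. The two key lines, as they appear in the convertOmpParallel changes below:

  // phis[0] is the pointer yielded by the privatizer's alloc region, i.e. the
  // freshly allocated private copy.
  moduleTranslation.mapValue(privateBlockArgs[i], phis[0]);
  llvmPrivateVars.push_back(phis[0]);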
CHECK: %[[VAL_50:.*]] = icmp ne i32 %[[VAL_49]], 50
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 816c7ff9509d2..f3d54ee5d572c 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -371,20 +371,46 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase &builder,
   return success();
 }
 
-/// Populates `reductions` with reduction declarations used in the given loop.
+/// Looks up from the operation from and returns the PrivateClauseOp with
+/// name symbolName
+static omp::PrivateClauseOp findPrivatizer(Operation *from,
+                                           SymbolRefAttr symbolName) {
+  omp::PrivateClauseOp privatizer =
+      SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(from,
+                                                                 symbolName);
+  assert(privatizer && "privatizer not found in the symbol table");
+  return privatizer;
+}
+
+/// Populates `privatizations` with privatization declarations used for the
+/// given op.
+/// TODO: generalise beyond ParallelOp
+static void collectPrivatizationDecls(
+    omp::ParallelOp op, SmallVectorImpl<omp::PrivateClauseOp> &privatizations) {
+  std::optional<ArrayAttr> attr = op.getPrivateSyms();
+  if (!attr)
+    return;
+
+  privatizations.reserve(privatizations.size() + attr->size());
+  for (auto symbolRef : attr->getAsRange<SymbolRefAttr>()) {
+    privatizations.push_back(findPrivatizer(op, symbolRef));
+  }
+}
+
+/// Populates `reductions` with reduction declarations used in the given op.
 template <typename T>
 static void
-collectReductionDecls(T loop,
+collectReductionDecls(T op,
                       SmallVectorImpl<omp::DeclareReductionOp> &reductions) {
-  std::optional<ArrayAttr> attr = loop.getReductionSyms();
+  std::optional<ArrayAttr> attr = op.getReductionSyms();
   if (!attr)
     return;
 
-  reductions.reserve(reductions.size() + loop.getNumReductionVars());
+  reductions.reserve(reductions.size() + op.getNumReductionVars());
   for (auto symbolRef : attr->getAsRange<SymbolRefAttr>()) {
     reductions.push_back(
         SymbolTable::lookupNearestSymbolFrom<omp::DeclareReductionOp>(
-            loop, symbolRef));
+            op, symbolRef));
   }
 }
 
@@ -609,7 +635,7 @@ static LogicalResult
 allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
                    llvm::IRBuilderBase &builder,
                    LLVM::ModuleTranslation &moduleTranslation,
-                   llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
+                   const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
                    SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
                    SmallVectorImpl<llvm::Value *> &privateReductionVariables,
                    DenseMap<Value, llvm::Value *> &reductionVariableMap,
@@ -1317,76 +1343,11 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
                                  privateReductionVariables, isByRef);
 }
 
-/// A RAII class that on construction replaces the region arguments of the
-/// parallel op (which correspond to private variables) with the actual private
-/// variables they correspond to. This prepares the parallel op so that it
-/// matches what is expected by the OMPIRBuilder.
-///
-/// On destruction, it restores the original state of the operation so that on
-/// the MLIR side, the op is not affected by conversion to LLVM IR.
-class OmpParallelOpConversionManager { -public: - OmpParallelOpConversionManager(omp::ParallelOp opInst) - : region(opInst.getRegion()), - privateBlockArgs(cast(*opInst) - .getPrivateBlockArgs()), - privateVars(opInst.getPrivateVars()) { - for (auto [blockArg, var] : llvm::zip_equal(privateBlockArgs, privateVars)) - mlir::replaceAllUsesInRegionWith(blockArg, var, region); - } - - ~OmpParallelOpConversionManager() { - for (auto [blockArg, var] : llvm::zip_equal(privateBlockArgs, privateVars)) - mlir::replaceAllUsesInRegionWith(var, blockArg, region); - } - -private: - Region ®ion; - llvm::MutableArrayRef privateBlockArgs; - OperandRange privateVars; -}; - -// Looks up from the operation from and returns the PrivateClauseOp with -// name symbolName -static omp::PrivateClauseOp findPrivatizer(Operation *from, - SymbolRefAttr symbolName) { - omp::PrivateClauseOp privatizer = - SymbolTable::lookupNearestSymbolFrom(from, - symbolName); - assert(privatizer && "privatizer not found in the symbol table"); - return privatizer; -} -// clones the given privatizer. The original privatizer is used as -// the insert point for the clone. -static omp::PrivateClauseOp -clonePrivatizer(LLVM::ModuleTranslation &moduleTranslation, - omp::PrivateClauseOp privatizer, Operation *fromOperation) { - MLIRContext &context = moduleTranslation.getContext(); - mlir::IRRewriter opCloner(&context); - opCloner.setInsertionPoint(privatizer); - auto clone = - llvm::cast(opCloner.clone(*privatizer)); - - // Unique the clone name to avoid clashes in the symbol table. - unsigned counter = 0; - SmallString<256> cloneName = SymbolTable::generateSymbolName<256>( - privatizer.getSymName(), - [&](llvm::StringRef candidate) { - return SymbolTable::lookupNearestSymbolFrom( - fromOperation, StringAttr::get(&context, candidate)) != - nullptr; - }, - counter); - - clone.setSymName(cloneName); - return clone; -} /// Converts the OpenMP parallel operation to LLVM IR. 
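With the helpers introduced above (findPrivatizer / collectPrivatizationDecls) and the body-callback conversion in the updated convertOmpParallel below, the clone-and-rename machinery removed here is no longer needed: a single omp.private op can be converted once per use by temporarily mapping its entry-block arguments and then forgetting that mapping. A minimal sketch of that reuse pattern, using only APIs present elsewhere in this patch (decl, firstVar and secondVar are hypothetical):

  // Illustrative only: convert one privatizer twice, for two different vars.
  for (Value var : {firstVar, secondVar}) {
    moduleTranslation.mapValue(decl.getAllocMoldArg(),
                               moduleTranslation.lookupValue(var));
    SmallVector<llvm::Value *> yielded;
    if (failed(inlineConvertOmpRegions(decl.getAllocRegion(),
                                       "omp.private.alloc", builder,
                                       moduleTranslation, &yielded)))
      return failure();
    // Drop the per-use mapping so the next iteration can bind a new source.
    moduleTranslation.forgetMapping(decl.getAllocRegion());
  }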
 static LogicalResult
 convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
                    LLVM::ModuleTranslation &moduleTranslation) {
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
-  OmpParallelOpConversionManager raii(opInst);
   ArrayRef<bool> isByRef = getIsByRef(opInst.getReductionByref());
   assert(isByRef.size() == opInst.getNumReductionVars());
 
@@ -1395,6 +1356,15 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
   LogicalResult bodyGenStatus = success();
   llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
 
+  // Collect delayed privatization declarations
+  MutableArrayRef<BlockArgument> privateBlockArgs =
+      cast<omp::BlockArgOpenMPOpInterface>(*opInst).getPrivateBlockArgs();
+  SmallVector<llvm::Value *> llvmPrivateVars;
+  SmallVector<omp::PrivateClauseOp> privateDecls;
+  llvmPrivateVars.reserve(privateBlockArgs.size());
+  privateDecls.reserve(privateBlockArgs.size());
+  collectPrivatizationDecls(opInst, privateDecls);
+
   // Collect reduction declarations
   SmallVector<omp::DeclareReductionOp> reductionDecls;
   collectReductionDecls(opInst, reductionDecls);
@@ -1403,6 +1373,66 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
   SmallVector<DeferredStore> deferredStores;
 
   auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
+    // Allocate private vars
+    llvm::BranchInst *allocaTerminator =
+        llvm::cast<llvm::BranchInst>(allocaIP.getBlock()->getTerminator());
+    builder.SetInsertPoint(allocaTerminator);
+    assert(allocaTerminator->getNumSuccessors() == 1 &&
+           "This is an unconditional branch created by OpenMPIRBuilder");
+    llvm::BasicBlock *afterAllocas = allocaTerminator->getSuccessor(0);
+
+    // FIXME: Some of the allocation regions do more than just allocating.
+    // They read from their block argument (amongst other non-alloca things).
+    // When OpenMPIRBuilder outlines the parallel region into a different
+    // function it places the loads for live in-values (such as these block
+    // arguments) at the end of the entry block (because the entry block is
+    // assumed to contain only allocas). Therefore, if we put these complicated
+    // alloc blocks in the entry block, these will not dominate the availability
+    // of the live-in values they are using. Fix this by adding a latealloc
+    // block after the entry block to put these in (this also helps to avoid
+    // mixing non-alloca code with allocas).
+    // Alloc regions which do not use the block argument can still be placed in
+    // the entry block (therefore keeping the allocas together).
+    llvm::BasicBlock *privAllocBlock = nullptr;
+    if (!privateBlockArgs.empty())
+      privAllocBlock = splitBB(builder, true, "omp.private.latealloc");
+    for (unsigned i = 0; i < privateBlockArgs.size(); ++i) {
+      Region &allocRegion = privateDecls[i].getAllocRegion();
+
+      // map allocation region block argument
+      llvm::Value *nonPrivateVar =
+          moduleTranslation.lookupValue(opInst.getPrivateVars()[i]);
+      assert(nonPrivateVar);
+      moduleTranslation.mapValue(privateDecls[i].getAllocMoldArg(),
+                                 nonPrivateVar);
+
+      // in-place convert the private allocation region
+      SmallVector<llvm::Value *> phis;
+      if (privateDecls[i].getAllocMoldArg().getUses().empty()) {
+        // TODO this should use
+        // allocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca() so it goes before
+        // the code for fetching the thread id. Not doing this for now to avoid
+        // test churn.
+        builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
+      } else {
+        builder.SetInsertPoint(privAllocBlock->getTerminator());
+      }
+      if (failed(inlineConvertOmpRegions(allocRegion, "omp.private.alloc",
+                                         builder, moduleTranslation, &phis))) {
+        bodyGenStatus = failure();
+        return;
+      }
+      assert(phis.size() == 1 && "expected one allocation to be yielded");
+
+      moduleTranslation.mapValue(privateBlockArgs[i], phis[0]);
+      llvmPrivateVars.push_back(phis[0]);
+
+      // clear alloc region block argument mapping in case it needs to be
+      // re-created with a different source for another use of the same
+      // reduction decl
+      moduleTranslation.forgetMapping(allocRegion);
+    }
+
     // Allocate reduction vars
     DenseMap<Value, llvm::Value *> reductionVariableMap;
 
@@ -1419,12 +1449,64 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
                                       deferredStores, isByRef)))
       bodyGenStatus = failure();
 
+    // Apply copy region for firstprivate.
+    bool needsFirstprivate =
+        llvm::any_of(privateDecls, [](omp::PrivateClauseOp &privOp) {
+          return privOp.getDataSharingType() ==
+                 omp::DataSharingClauseType::FirstPrivate;
+        });
+    if (needsFirstprivate) {
+      // Find the end of the allocation blocks
+      assert(afterAllocas->getSinglePredecessor());
+      builder.SetInsertPoint(
+          afterAllocas->getSinglePredecessor()->getTerminator());
+      llvm::BasicBlock *copyBlock =
+          splitBB(builder, /*CreateBranch=*/true, "omp.private.copy");
+      builder.SetInsertPoint(copyBlock->getFirstNonPHIOrDbgOrAlloca());
+    }
+    for (unsigned i = 0; i < privateBlockArgs.size(); ++i) {
+      if (privateDecls[i].getDataSharingType() !=
+          omp::DataSharingClauseType::FirstPrivate)
+        continue;
+
+      // copyRegion implements `lhs = rhs`
+      Region &copyRegion = privateDecls[i].getCopyRegion();
+
+      // map copyRegion rhs arg
+      llvm::Value *nonPrivateVar =
+          moduleTranslation.lookupValue(opInst.getPrivateVars()[i]);
+      assert(nonPrivateVar);
+      moduleTranslation.mapValue(privateDecls[i].getCopyMoldArg(),
+                                 nonPrivateVar);
+
+      // map copyRegion lhs arg
+      moduleTranslation.mapValue(privateDecls[i].getCopyPrivateArg(),
+                                 llvmPrivateVars[i]);
+
+      // in-place convert copy region
+      builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
+      if (failed(inlineConvertOmpRegions(copyRegion, "omp.private.copy",
+                                         builder, moduleTranslation))) {
+        bodyGenStatus = failure();
+        return;
+      }
+
+      // ignore unused value yielded from copy region
+
+      // clear copy region block argument mapping in case it needs to be
+      // re-created with different sources for reuse of the same reduction
+      // decl
+      moduleTranslation.forgetMapping(copyRegion);
+    }
+
     // Initialize reduction vars
-    builder.restoreIP(allocaIP);
+    builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
     llvm::BasicBlock *initBlock = splitBB(builder, true, "omp.reduction.init");
     allocaIP =
         InsertPointTy(allocaIP.getBlock(),
                       allocaIP.getBlock()->getTerminator()->getIterator());
+
+    builder.restoreIP(allocaIP);
     SmallVector<llvm::Value *> byRefVars(opInst.getNumReductionVars());
     for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
       if (isByRef[i]) {
@@ -1534,183 +1616,11 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
     }
   };
 
-  SmallVector<omp::PrivateClauseOp> mlirPrivatizerClones;
-  SmallVector<llvm::Value *> llvmPrivateVars;
-
-  // TODO: Perform appropriate actions according to the data-sharing
-  // attribute (shared, private, firstprivate, ...) of variables.
-  // Currently shared and private are supported.
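One detail worth calling out in the hunk above: reduction initialization no longer runs at allocaIP. Only the allocas stay in the entry block, while the initialization code is emitted in a freshly split omp.reduction.init block, mirroring the latealloc discipline used for the private allocations. A hedged sketch of the general pattern, reusing allocaIP, builder and InsertPointTy from the surrounding code (scratch is a hypothetical value, not from the patch):

  // Allocas stay in the entry block; non-alloca init goes in its own block.
  llvm::BasicBlock *initBlock =
      splitBB(builder, /*CreateBranch=*/true, "omp.reduction.init");
  allocaIP = InsertPointTy(allocaIP.getBlock(),
                           allocaIP.getBlock()->getTerminator()->getIterator());
  builder.restoreIP(allocaIP); // allocas are still emitted here ...
  llvm::AllocaInst *scratch = builder.CreateAlloca(builder.getPtrTy());
  // ... while anything that is not an alloca is emitted in the split block.
  builder.SetInsertPoint(initBlock->getTerminator());
  builder.CreateStore(llvm::Constant::getNullValue(builder.getPtrTy()), scratch);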
- auto privCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP, - llvm::Value &, llvm::Value &llvmOmpRegionInput, - llvm::Value *&llvmReplacementValue) -> InsertPointTy { - llvmReplacementValue = &llvmOmpRegionInput; - - // If this is a private value, this lambda will return the corresponding - // mlir value and its `PrivateClauseOp`. Otherwise, empty values are - // returned. - auto [mlirPrivVar, mlirPrivatizerClone] = - [&]() -> std::pair { - if (!opInst.getPrivateVars().empty()) { - auto mlirPrivVars = opInst.getPrivateVars(); - auto mlirPrivSyms = opInst.getPrivateSyms(); - - // Try to find a privatizer that corresponds to the LLVM value being - // privatized. - for (auto [mlirPrivVar, mlirPrivatizerAttr] : - llvm::zip_equal(mlirPrivVars, *mlirPrivSyms)) { - // Find the MLIR private variable corresponding to the LLVM value - // being privatized. - llvm::Value *mlirToLLVMPrivVar = - moduleTranslation.lookupValue(mlirPrivVar); - - // Check if the LLVM value being privatized matches the LLVM value - // mapped to privVar. In some cases, this is not trivial ... - auto isMatch = [&]() { - if (mlirToLLVMPrivVar == nullptr) - return false; - - // If both values are trivially equal, we found a match. - if (mlirToLLVMPrivVar == &llvmOmpRegionInput) - return true; - - // Otherwise, we check if both llvmOmpRegionInputPtr and - // mlirToLLVMPrivVar refer to the same memory (through a load/store - // pair). This happens if a struct (i.e. multi-field value) is being - // privatized. - // - // For example, if the privatized value is defined by: - // ``` - // %priv_val = alloca { ptr, i64 }, align 8 - // ``` - // - // The initialization of this value (outside the omp region) will be - // something like this: - // - // clang-format off - // ``` - // %partially_init_priv_val = insertvalue { ptr, i64 } undef, - // ptr %some_ptr, 0 - // %fully_init_priv_val = insertvalue { ptr, i64 } %partially_init_priv_val, - // i64 %some_i64, 1 - // ... - // store { ptr, i64 } %fully_init_priv_val, ptr %priv_val, align 8 - // ``` - // clang-format on - // - // Now, we enter the OMP region, in order to access this privatized - // value, we need to load from the allocated memory: - // ``` - // omp.par.entry: - // %priv_val_load = load { ptr, i64 }, ptr %priv_val, align 8 - // ``` - // - // The 2 LLVM values tracked here map as follows: - // - `mlirToLLVMPrivVar` -> `%fully_init_priv_val` - // - `llvmOmpRegionInputPtr` -> `%priv_val_load` - // - // Even though they eventually refer to the same memory reference - // (the memory being privatized), they are 2 distinct LLVM values. - // Therefore, we need to discover their correspondence by finding - // out if they store into and load from the same mem ref. - auto *llvmOmpRegionInputPtrLoad = - llvm::dyn_cast_if_present(&llvmOmpRegionInput); - - if (llvmOmpRegionInputPtrLoad == nullptr) - return false; - - for (auto &use : mlirToLLVMPrivVar->uses()) { - auto *mlirToLLVMPrivVarStore = - llvm::dyn_cast_if_present(use.getUser()); - if (mlirToLLVMPrivVarStore && - (llvmOmpRegionInputPtrLoad->getPointerOperand() == - mlirToLLVMPrivVarStore->getPointerOperand())) - return true; - } - - return false; - }; - - if (!isMatch()) - continue; - - SymbolRefAttr privSym = llvm::cast(mlirPrivatizerAttr); - omp::PrivateClauseOp privatizer = findPrivatizer(opInst, privSym); - - // Clone the privatizer in case it is used by more than one parallel - // region. 
The privatizer is processed in-place (see below) before it - // gets inlined in the parallel region and therefore processing the - // original op is dangerous. - return {mlirPrivVar, - clonePrivatizer(moduleTranslation, privatizer, opInst)}; - } - } - - return {mlir::Value(), omp::PrivateClauseOp()}; - }(); - - if (mlirPrivVar) { - Region &allocRegion = mlirPrivatizerClone.getAllocRegion(); - - // If this is a `firstprivate` clause, prepare the `omp.private` op by: - if (mlirPrivatizerClone.getDataSharingType() == - omp::DataSharingClauseType::FirstPrivate) { - auto oldAllocBackBlock = std::prev(allocRegion.end()); - omp::YieldOp oldAllocYieldOp = - llvm::cast(oldAllocBackBlock->getTerminator()); - - Region ©Region = mlirPrivatizerClone.getCopyRegion(); - - mlir::IRRewriter copyCloneBuilder(&moduleTranslation.getContext()); - // 1. Cloning the `copy` region to the end of the `alloc` region. - copyCloneBuilder.cloneRegionBefore(copyRegion, allocRegion, - allocRegion.end()); - - auto newCopyRegionFrontBlock = std::next(oldAllocBackBlock); - // 2. Merging the last `alloc` block with the first block in the `copy` - // region clone. - // 3. Re-mapping the first argument of the `copy` region to be the - // argument of the `alloc` region and the second argument of the `copy` - // region to be the yielded value of the `alloc` region (this is the - // private clone of the privatized value). - copyCloneBuilder.mergeBlocks(&*newCopyRegionFrontBlock, - &*oldAllocBackBlock, - {mlirPrivatizerClone.getAllocMoldArg(), - oldAllocYieldOp.getOperand(0)}); - - // 4. The old terminator of the `alloc` region is not needed anymore, so - // delete it. - oldAllocYieldOp.erase(); - } - - // Replace the privatizer block argument with mlir value being privatized. - // This way, the body of the privatizer will be changed from using the - // region/block argument to the value being privatized. - replaceAllUsesInRegionWith(mlirPrivatizerClone.getAllocMoldArg(), - mlirPrivVar, allocRegion); - - auto oldIP = builder.saveIP(); - builder.restoreIP(allocaIP); - - SmallVector yieldedValues; - if (failed(inlineConvertOmpRegions(allocRegion, "omp.privatizer", builder, - moduleTranslation, &yieldedValues))) { - opInst.emitError("failed to inline `alloc` region of an `omp.private` " - "op in the parallel region"); - bodyGenStatus = failure(); - mlirPrivatizerClone.erase(); - } else { - assert(yieldedValues.size() == 1); - llvmReplacementValue = yieldedValues.front(); - - // Keep the LLVM replacement value and the op clone in case we need to - // emit cleanup (i.e. deallocation) logic. - llvmPrivateVars.push_back(llvmReplacementValue); - mlirPrivatizerClones.push_back(mlirPrivatizerClone); - } - - builder.restoreIP(oldIP); - } - + auto privCB = [](InsertPointTy allocaIP, InsertPointTy codeGenIP, + llvm::Value &, llvm::Value &val, llvm::Value *&replVal) { + // tell OpenMPIRBuilder not to do anything. We handled Privatisation in + // bodyGenCB. 
+ replVal = &val; return codeGenIP; }; @@ -1733,8 +1643,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, bodyGenStatus = failure(); SmallVector privateCleanupRegions; - llvm::transform(mlirPrivatizerClones, - std::back_inserter(privateCleanupRegions), + llvm::transform(privateDecls, std::back_inserter(privateCleanupRegions), [](omp::PrivateClauseOp privatizer) { return &privatizer.getDeallocRegion(); }); @@ -1767,9 +1676,6 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, ompBuilder->createParallel(ompLoc, allocaIP, bodyGenCB, privCB, finiCB, ifCond, numThreads, pbKind, isCancellable)); - for (mlir::omp::PrivateClauseOp privatizerClone : mlirPrivatizerClones) - privatizerClone.erase(); - return bodyGenStatus; } diff --git a/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir b/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir index 02ce6b5b19cea..79412fb69f758 100644 --- a/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir +++ b/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir @@ -74,27 +74,38 @@ llvm.func @parallel_op_firstprivate_multi_block(%arg0: !llvm.ptr) { // CHECK: [[PRIV_BB2]]: // CHECK-NEXT: %[[C1:.*]] = phi i32 [ 1, %[[PRIV_BB1]] ] // CHECK-NEXT: %[[PRIV_ALLOC:.*]] = alloca float, i32 %[[C1]], align 4 -// The entry block of the `copy` region is merged into the exit block of the -// `alloc` region. So check for that. +// CHECK-NEXT: br label %omp.region.cont + +// CHECK: omp.region.cont: +// CHECK-NEXT: %[[PRIV_ALLOC2:.*]] = phi ptr [ %[[PRIV_ALLOC]], %[[PRIV_BB2]] ] +// CHECK-NEXT: br label %omp.private.latealloc + +// CHECK: omp.private.latealloc: +// CHECK-NEXT: br label %omp.private.copy + +// CHECK: omp.private.copy: +// CHECK-NEXT: br label %omp.private.copy3 + +// CHECK: omp.private.copy3: // CHECK-NEXT: %[[ORIG_VAL:.*]] = load float, ptr %[[ORIG_PTR]], align 4 // CHECK-NEXT: br label %[[PRIV_BB3:.*]] // Check contents of the 2nd block in the `copy` region. // CHECK: [[PRIV_BB3]]: -// CHECK-NEXT: %[[ORIG_VAL2:.*]] = phi float [ %[[ORIG_VAL]], %[[PRIV_BB2]] ] -// CHECK-NEXT: %[[PRIV_ALLOC2:.*]] = phi ptr [ %[[PRIV_ALLOC]], %[[PRIV_BB2]] ] -// CHECK-NEXT: store float %[[ORIG_VAL2]], ptr %[[PRIV_ALLOC2]], align 4 +// CHECK-NEXT: %[[ORIG_VAL2:.*]] = phi float [ %[[ORIG_VAL]], %omp.private.copy3 ] +// CHECK-NEXT: %[[PRIV_ALLOC3:.*]] = phi ptr [ %[[PRIV_ALLOC2]], %omp.private.copy3 ] +// CHECK-NEXT: store float %[[ORIG_VAL2]], ptr %[[PRIV_ALLOC3]], align 4 // CHECK-NEXT: br label %[[PRIV_CONT:.*]] // Check that the privatizer's continuation block yileds the private clone's // address. // CHECK: [[PRIV_CONT]]: -// CHECK-NEXT: %[[PRIV_ALLOC3:.*]] = phi ptr [ %[[PRIV_ALLOC2]], %[[PRIV_BB3]] ] +// CHECK-NEXT: %[[PRIV_ALLOC4:.*]] = phi ptr [ %[[PRIV_ALLOC3]], %[[PRIV_BB3]] ] // CHECK-NEXT: br label %[[PAR_REG:.*]] // Check that the body of the parallel region loads from the private clone. 
// CHECK: [[PAR_REG]]: -// CHECK: %{{.*}} = load float, ptr %[[PRIV_ALLOC3]], align 4 +// CHECK: %{{.*}} = load float, ptr %[[PRIV_ALLOC2]], align 4 omp.private {type = firstprivate} @multi_block.privatizer : !llvm.ptr alloc { ^bb0(%arg0: !llvm.ptr): diff --git a/mlir/test/Target/LLVMIR/openmp-private.mlir b/mlir/test/Target/LLVMIR/openmp-private.mlir index 6153e5685c29f..5407f97286eb1 100644 --- a/mlir/test/Target/LLVMIR/openmp-private.mlir +++ b/mlir/test/Target/LLVMIR/openmp-private.mlir @@ -104,6 +104,9 @@ llvm.func @parallel_op_private_multi_block(%arg0: !llvm.ptr) { // CHECK: omp.par.entry: // CHECK: %[[ORIG_PTR_PTR:.*]] = getelementptr { ptr }, ptr %{{.*}}, i32 0, i32 0 // CHECK: %[[ORIG_PTR:.*]] = load ptr, ptr %[[ORIG_PTR_PTR]], align 8 +// CHECK: br label %omp.private.latealloc + +// CHECK: omp.private.latealloc: // CHECK: br label %[[PRIV_BB1:.*]] // Check contents of the first block in the `alloc` region. @@ -151,8 +154,7 @@ omp.private {type = private} @multi_block.privatizer : !llvm.ptr alloc { // CHECK: omp.par.region: // CHECK: br label %[[PAR_REG_BEG:.*]] // CHECK: [[PAR_REG_BEG]]: -// CHECK: %[[PRIVATIZER_GEP:.*]] = getelementptr double, ptr @_QQfoo, i64 111 -// CHECK: call void @bar(ptr %[[PRIVATIZER_GEP]]) +// CHECK: call void @bar(ptr getelementptr (double, ptr @_QQfoo, i64 111)) // CHECK: call void @bar(ptr getelementptr (double, ptr @_QQfoo, i64 222)) llvm.func @lower_region_with_addressof() { %0 = llvm.mlir.constant(1 : i64) : i64
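A note on the last test update: the privatized call argument is now matched as an inline constant expression rather than a separate getelementptr instruction. This is presumably because the privatizer body is now converted through the ordinary region translation with the global's address mapped in directly, letting IRBuilder's constant folder fold the GEP. A small illustration under that assumption (module and builder are hypothetical context, not from the patch):

  // A GEP whose base is a constant global folds to a ConstantExpr, so no
  // instruction is emitted.
  llvm::GlobalVariable *base = module.getNamedGlobal("_QQfoo");
  llvm::Value *elt = builder.CreateConstGEP1_64(builder.getDoubleTy(), base, 111);
  // `elt` is the inline `getelementptr (double, ptr @_QQfoo, i64 111)` that the
  // updated CHECK line matches.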