From 41a9cf5963631ba84e8d1caa9a9c0cfdbcf2fe3b Mon Sep 17 00:00:00 2001 From: ergawy Date: Wed, 18 Dec 2024 02:39:40 -0600 Subject: [PATCH] [flang][OpenMP] Rewrite standalone `loop` directives to `simd` Extends conversion support for `loop` directives. This PR handles standalone `loop` constructs by rewriting them to equivalent `simd` constructs. The reasoning behind that decision is documented in the rewrite function itself. --- .../OpenMP/GenericLoopConversion.cpp | 87 +++++++++++++++++-- flang/test/Lower/OpenMP/loop-directive.f90 | 45 +++++++++- .../generic-loop-rewriting-todo.mlir | 13 --- 3 files changed, 123 insertions(+), 22 deletions(-) diff --git a/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp b/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp index c3c1f3b2848b8..555601c5e92df 100644 --- a/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp @@ -30,19 +30,39 @@ class GenericLoopConversionPattern : public mlir::OpConversionPattern { public: enum class GenericLoopCombinedInfo { - None, + Standalone, TargetTeamsLoop, TargetParallelLoop }; using mlir::OpConversionPattern::OpConversionPattern; + explicit GenericLoopConversionPattern(mlir::MLIRContext *ctx) + : mlir::OpConversionPattern{ctx} { + // Enable rewrite recursion to make sure nested `loop` directives are + // handled. 
+    this->setHasBoundedRewriteRecursion(true);
+  }
+
   mlir::LogicalResult
   matchAndRewrite(mlir::omp::LoopOp loopOp, OpAdaptor adaptor,
                   mlir::ConversionPatternRewriter &rewriter) const override {
     assert(mlir::succeeded(checkLoopConversionSupportStatus(loopOp)));
 
-    rewriteToDistributeParallelDo(loopOp, rewriter);
+    GenericLoopCombinedInfo combinedInfo = findGenericLoopCombineInfo(loopOp);
+
+    switch (combinedInfo) {
+    case GenericLoopCombinedInfo::Standalone:
+      rewriteToSimdLoop(loopOp, rewriter);
+      break;
+    case GenericLoopCombinedInfo::TargetParallelLoop:
+      llvm_unreachable("not yet implemented: `parallel loop` directive");
+      break;
+    case GenericLoopCombinedInfo::TargetTeamsLoop:
+      rewriteToDistributeParallelDo(loopOp, rewriter);
+      break;
+    }
+
     rewriter.eraseOp(loopOp);
     return mlir::success();
   }
@@ -52,9 +72,8 @@ class GenericLoopConversionPattern
     GenericLoopCombinedInfo combinedInfo = findGenericLoopCombineInfo(loopOp);
 
     switch (combinedInfo) {
-    case GenericLoopCombinedInfo::None:
-      return loopOp.emitError(
-          "not yet implemented: Standalone `omp loop` directive");
+    case GenericLoopCombinedInfo::Standalone:
+      break;
     case GenericLoopCombinedInfo::TargetParallelLoop:
       return loopOp.emitError(
           "not yet implemented: Combined `omp target parallel loop` directive");
@@ -86,7 +105,7 @@ class GenericLoopConversionPattern
   static GenericLoopCombinedInfo
   findGenericLoopCombineInfo(mlir::omp::LoopOp loopOp) {
     mlir::Operation *parentOp = loopOp->getParentOp();
-    GenericLoopCombinedInfo result = GenericLoopCombinedInfo::None;
+    GenericLoopCombinedInfo result = GenericLoopCombinedInfo::Standalone;
 
     if (auto teamsOp = mlir::dyn_cast_if_present<mlir::omp::TeamsOp>(parentOp))
       if (mlir::isa_and_present<mlir::omp::TargetOp>(teamsOp->getParentOp()))
@@ -100,6 +119,62 @@ class GenericLoopConversionPattern
     return result;
   }
 
+  /// Rewrites standalone `loop` directives to equivalent `simd` constructs.
+ /// The reasoning behind this decision is that according to the spec (version + /// 5.2, section 11.7.1): + /// + /// "If the bind clause is not specified on a construct for which it may be + /// specified and the construct is closely nested inside a teams or parallel + /// construct, the effect is as if binding is teams or parallel. If none of + /// those conditions hold, the binding region is not defined." + /// + /// which means that standalone `loop` directives have undefined binding + /// region. Moreover, the spec says (in the next paragraph): + /// + /// "The specified binding region determines the binding thread set. + /// Specifically, if the binding region is a teams region, then the binding + /// thread set is the set of initial threads that are executing that region + /// while if the binding region is a parallel region, then the binding thread + /// set is the team of threads that are executing that region. If the binding + /// region is not defined, then the binding thread set is the encountering + /// thread." + /// + /// which means that the binding thread set for a standalone `loop` directive + /// is only the encountering thread. + /// + /// Since the encountering thread is the binding thread (set) for a + /// standalone `loop` directive, the best we can do in such case is to "simd" + /// the directive. 
void rewriteToSimdLoop(mlir::omp::LoopOp loopOp,
+                         mlir::ConversionPatternRewriter &rewriter) const {
+    loopOp.emitWarning("Detected standalone OpenMP `loop` directive, the "
+                       "associated loop will be rewritten to `simd`.");
+    mlir::omp::SimdOperands simdClauseOps;
+    simdClauseOps.privateVars = loopOp.getPrivateVars();
+
+    auto privateSyms = loopOp.getPrivateSyms();
+    if (privateSyms)
+      simdClauseOps.privateSyms.assign(privateSyms->begin(),
+                                       privateSyms->end());
+
+    Fortran::common::openmp::EntryBlockArgs simdArgs;
+    simdArgs.priv.vars = simdClauseOps.privateVars;
+
+    auto simdOp =
+        rewriter.create<mlir::omp::SimdOp>(loopOp.getLoc(), simdClauseOps);
+    mlir::Block *simdBlock =
+        genEntryBlock(rewriter, simdArgs, simdOp.getRegion());
+
+    mlir::IRMapping mapper;
+    mlir::Block &loopBlock = *loopOp.getRegion().begin();
+
+    for (auto [loopOpArg, simdopArg] :
+         llvm::zip_equal(loopBlock.getArguments(), simdBlock->getArguments()))
+      mapper.map(loopOpArg, simdopArg);
+
+    rewriter.clone(*loopOp.begin(), mapper);
+  }
+
   void rewriteToDistributeParallelDo(
       mlir::omp::LoopOp loopOp,
       mlir::ConversionPatternRewriter &rewriter) const {
diff --git a/flang/test/Lower/OpenMP/loop-directive.f90 b/flang/test/Lower/OpenMP/loop-directive.f90
index 4b4d640e449ee..9fa0de3bfe171 100644
--- a/flang/test/Lower/OpenMP/loop-directive.f90
+++ b/flang/test/Lower/OpenMP/loop-directive.f90
@@ -11,7 +11,7 @@
 subroutine test_no_clauses()
   integer :: i, j, dummy = 1
 
-  ! CHECK: omp.loop private(@[[I_PRIV]] %{{.*}}#0 -> %[[ARG:.*]] : !fir.ref<i32>) {
+  ! CHECK: omp.simd private(@[[I_PRIV]] %{{.*}}#0 -> %[[ARG:.*]] : !fir.ref<i32>) {
   ! CHECK-NEXT: omp.loop_nest (%[[IV:.*]]) : i32 = (%{{.*}}) to (%{{.*}}) {{.*}} {
   ! CHECK: %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG]]
   ! CHECK: fir.store %[[IV]] to %[[ARG_DECL]]#1 : !fir.ref<i32>
@@ -27,7 +27,7 @@
 ! CHECK-LABEL: func.func @_QPtest_collapse
 subroutine test_collapse()
   integer :: i, j, dummy = 1
-  ! 
CHECK: omp.loop private(@{{.*}} %{{.*}}#0 -> %{{.*}}, @{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) {
+  ! CHECK: omp.simd private(@{{.*}} %{{.*}}#0 -> %{{.*}}, @{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) {
   ! CHECK-NEXT: omp.loop_nest (%{{.*}}, %{{.*}}) : i32 {{.*}} {
   ! CHECK: }
   ! CHECK: }
@@ -43,7 +43,7 @@ subroutine test_collapse()
 ! CHECK-LABEL: func.func @_QPtest_private
 subroutine test_private()
   integer :: i, dummy = 1
-  ! CHECK: omp.loop private(@[[DUMMY_PRIV]] %{{.*}}#0 -> %[[DUMMY_ARG:.*]], @{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) {
+  ! CHECK: omp.simd private(@[[DUMMY_PRIV]] %{{.*}}#0 -> %[[DUMMY_ARG:.*]], @{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) {
   ! CHECK-NEXT: omp.loop_nest (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) {{.*}} {
   ! CHECK: %[[DUMMY_DECL:.*]]:2 = hlfir.declare %[[DUMMY_ARG]] {uniq_name = "_QFtest_privateEdummy"}
   ! CHECK: %{{.*}} = fir.load %[[DUMMY_DECL]]#0
@@ -100,3 +100,42 @@ subroutine test_bind()
   end do
   !$omp end loop
 end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_nested_directives
+subroutine test_nested_directives
+  implicit none
+  integer, parameter :: N = 100000
+  integer a(N), b(N), c(N)
+  integer j,i, num, flag;
+  num = N
+
+  ! CHECK: omp.teams {
+
+  ! Verify the first `loop` directive was combined with `target teams` into
+  ! `target teams distribute parallel do`.
+  ! CHECK: omp.parallel {{.*}} {
+  ! CHECK: omp.distribute {
+  ! CHECK: omp.wsloop {
+  ! CHECK: omp.loop_nest {{.*}} {
+
+  ! Verify the second `loop` directive was rewritten to `simd`.
+  ! CHECK: omp.simd {{.*}} {
+  ! CHECK: omp.loop_nest {{.*}} {
+  ! CHECK: }
+  ! CHECK: }
+
+  ! CHECK: }
+  ! CHECK: } {omp.composite}
+  ! CHECK: } {omp.composite}
+  ! CHECK: } {omp.composite}
+  ! 
CHECK: } + !$omp target teams map(to: a,b) map(from: c) + !$omp loop + do j=1,1000 + !$omp loop + do i=1,N + c(i) = a(i) * b(i) + end do + end do + !$omp end target teams +end subroutine diff --git a/flang/test/Transforms/generic-loop-rewriting-todo.mlir b/flang/test/Transforms/generic-loop-rewriting-todo.mlir index 9ea6bf001b668..becd6b8dcb5cb 100644 --- a/flang/test/Transforms/generic-loop-rewriting-todo.mlir +++ b/flang/test/Transforms/generic-loop-rewriting-todo.mlir @@ -1,18 +1,5 @@ // RUN: fir-opt --omp-generic-loop-conversion -verify-diagnostics %s -func.func @_QPtarget_loop() { - %c0 = arith.constant 0 : i32 - %c10 = arith.constant 10 : i32 - %c1 = arith.constant 1 : i32 - // expected-error@below {{not yet implemented: Standalone `omp loop` directive}} - omp.loop { - omp.loop_nest (%arg3) : i32 = (%c0) to (%c10) inclusive step (%c1) { - omp.yield - } - } - return -} - func.func @_QPtarget_parallel_loop() { omp.target { omp.parallel {