-
Notifications
You must be signed in to change notification settings - Fork 12.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[WIP][flang][OpenMP] Experimental pass to map do concurrent
to OMP
#77285
Draft
ergawy
wants to merge
2
commits into
llvm:main
Choose a base branch
from
ergawy:do_concurrent_to_omp
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+289
−1
Draft
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
205 changes: 205 additions & 0 deletions
205
flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,205 @@ | ||
//===- DoConcurrentConversion.cpp -- map `DO CONCURRENT` to OpenMP loops --===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include "flang/Optimizer/Dialect/FIRDialect.h" | ||
#include "flang/Optimizer/Dialect/FIROps.h" | ||
#include "flang/Optimizer/Dialect/FIRType.h" | ||
#include "flang/Optimizer/Dialect/Support/FIRContext.h" | ||
#include "flang/Optimizer/HLFIR/HLFIRDialect.h" | ||
#include "flang/Optimizer/Transforms/Passes.h" | ||
#include "mlir/Dialect/Func/IR/FuncOps.h" | ||
#include "mlir/Dialect/OpenMP/OpenMPDialect.h" | ||
#include "mlir/IR/Diagnostics.h" | ||
#include "mlir/IR/IRMapping.h" | ||
#include "mlir/Pass/Pass.h" | ||
#include "mlir/Transforms/DialectConversion.h" | ||
|
||
#include <memory> | ||
|
||
namespace fir { | ||
#define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS | ||
#include "flang/Optimizer/Transforms/Passes.h.inc" | ||
} // namespace fir | ||
|
||
#define DEBUG_TYPE "fopenmp-do-concurrent-conversion" | ||
|
||
namespace { | ||
/// Conversion pattern that rewrites a `fir.do_loop` into an `omp.parallel`
/// region containing an `omp.wsloop` with the same bounds, step, and body.
///
/// The rewrite proceeds in three steps:
///   1. the ops defining the lower bound, upper bound, and step are cloned
///      into the parallel region;
///   2. the ops backing the memory the induction variable is stored to (and
///      the ops defining their operands) are cloned into the parallel region,
///      so that the cloned body refers to region-local copies;
///   3. the loop body is cloned into the `omp.wsloop` and its terminator is
///      replaced by `omp.yield`.
///
/// The pattern reports a match failure (without touching the IR) when the loop
/// produces results, when a bound or the step is a block argument, or when the
/// induction variable is stored to a block argument.
class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
public:
  using mlir::OpConversionPattern<fir::DoLoopOp>::OpConversionPattern;

  mlir::LogicalResult
  matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor,
                  mlir::ConversionPatternRewriter &rewriter) const override {
    // The generated `omp.wsloop` is created without results and the body's
    // terminator is replaced by an operand-less `omp.yield`, so values
    // produced by the original loop would be silently dropped.
    if (doLoop->getNumResults() != 0)
      return rewriter.notifyMatchFailure(
          doLoop, "loops that produce results are not supported");

    // The bounds and the step are re-materialized inside the parallel region
    // by cloning their defining ops; a block argument has nothing to clone.
    if (!doLoop.getLowerBound().getDefiningOp() ||
        !doLoop.getUpperBound().getDefiningOp() ||
        !doLoop.getStep().getDefiningOp())
      return rewriter.notifyMatchFailure(
          doLoop, "loop bounds and step must be defined by operations");

    // ==== TODO (2) Start ====
    //
    // The goal of the following simple work-list algorithm and
    // the following `for` loop is to collect all the operations related to the
    // allocation of the induction variable for the `do concurrent` loop. The
    // operations collected by this algorithm are very similar to what is
    // usually emitted for privatized variables, e.g. for omp.parallel loops.
    // Therefore, I think we can:
    //
    // 1. **Stage 1**: Add an analysis that collects all these operations. The
    //                 goal is similar to **Stage 1** of TODO (1): isolate the
    //                 algorithm in an individually-testable component so that
    //                 we properly implement and test it for more complicated
    //                 `do concurrent` loops.
    // 2. **Stage 2**: Using the collected operations, create and populate an
    //                 `omp.private {type=private}` op to serve as the
    //                 delayed privatizer for the new work-sharing loop.
    //
    // This analysis only reads the IR. It runs before any op is created so
    // that every failure path returns with the IR untouched.

    // For the induction variable, we need to privatize its allocation and
    // binding inside the parallel region. Therefore, we first discover the
    // induction variable by discovering `fir.store`s that are reachable from
    // the loop's block argument.
    llvm::SmallSetVector<mlir::Operation *, 2> workList;
    workList.insert(doLoop.getInductionVar().getUsers().begin(),
                    doLoop.getInductionVar().getUsers().end());
    llvm::SmallSetVector<fir::StoreOp, 2> inductionVarTargetStores;

    // Walk the use-chain of the loop's block argument until we hit
    // `fir.store`.
    while (!workList.empty()) {
      mlir::Operation *item = workList.front();

      if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(item))
        inductionVarTargetStores.insert(storeOp);
      else
        workList.insert(item->getUsers().begin(), item->getUsers().end());

      workList.remove(item);
    }

    // For each collected `fir.store`, find the op defining the target memref
    // and the ops defining that op's operands.
    llvm::SmallSetVector<mlir::Operation *, 4> declareAndAllocasToClone;
    for (fir::StoreOp storeOp : inductionVarTargetStores) {
      mlir::Operation *storeTarget = storeOp.getMemref().getDefiningOp();
      if (!storeTarget)
        return rewriter.notifyMatchFailure(
            doLoop, "induction variable is stored to a block argument");

      // Operands that are block arguments have no defining op; there is
      // nothing to clone for them.
      for (mlir::Value operand : storeTarget->getOperands())
        if (mlir::Operation *operandDef = operand.getDefiningOp())
          declareAndAllocasToClone.insert(operandDef);

      // Inserted after its operands' definitions so that the clones are
      // created in def-before-use order.
      declareAndAllocasToClone.insert(storeTarget);
    }
    // ==== TODO (2) End ====
    //
    // TODO (1 & 2): Isolating the analyses proposed in both TODOs, I think we
    // can more easily generalize the pass to work for targets other than
    // OpenMP, e.g. OpenACC; we can reuse the results of the analyses and only
    // change the code-gen/rewriting.

    const mlir::Location loc = doLoop.getLoc();

    // Create `omp.parallel` with a single block ending in `omp.terminator`.
    auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loc);
    mlir::Block *block = rewriter.createBlock(&parallelOp.getRegion());

    rewriter.setInsertionPointToEnd(block);
    rewriter.create<mlir::omp::TerminatorOp>(loc);

    rewriter.setInsertionPointToStart(block);

    // ==== TODO (1) Start ====
    //
    // The goal of the few lines below is to collect and clone
    // the list of operations that define the loop's lower and upper bounds as
    // well as the step. Should we, instead of doing this here, split it into 2
    // stages?
    //
    //   1. **Stage 1**: add an analysis that extracts all the relevant
    //                   operations defining the lower-bound, upper-bound, and
    //                   step.
    //   2. **Stage 2**: clone the collected operations in the parallel region.
    //
    // So far, the pass has been tested with very simple loops (where the bounds
    // and step are constants) so the goal of **Stage 1** is to have a
    // well-defined component that has the sole responsibility of collecting all
    // the ops relevant to the loop header. This way we can test this
    // in isolation for more complex loops and better organize the code. **Stage
    // 2** would then be responsible for the actual cloning of the collected
    // loop header preparation/allocation operations.

    // Clones the op defining `value` at the current insertion point and
    // returns the clone's result that corresponds to `value` (which is not
    // necessarily result #0 for multi-result ops).
    auto cloneDefiningOp = [&rewriter](mlir::Value value) -> mlir::Value {
      auto result = mlir::cast<mlir::OpResult>(value);
      mlir::Operation *clone = rewriter.clone(*result.getOwner());
      return clone->getResult(result.getResultNumber());
    };

    // Clone the LB, UB, step defining ops inside the parallel region.
    llvm::SmallVector<mlir::Value> lowerBound, upperBound, step;
    lowerBound.push_back(cloneDefiningOp(doLoop.getLowerBound()));
    upperBound.push_back(cloneDefiningOp(doLoop.getUpperBound()));
    step.push_back(cloneDefiningOp(doLoop.getStep()));
    // ==== TODO (1) End ====

    auto wsLoopOp =
        rewriter.create<mlir::omp::WsLoopOp>(loc, lowerBound, upperBound, step);
    wsLoopOp.setInclusive(true);

    // `cast` rather than `dyn_cast`: the result is used unconditionally, so a
    // failed cast must be an assertion failure, not a null dereference.
    auto outlineableOp = mlir::cast<mlir::omp::OutlineableOpenMPOpInterface>(
        parallelOp.getOperation());
    rewriter.setInsertionPointToStart(outlineableOp.getAllocaBlock());

    // Clone the memref-defining ops into the parallel region, recording the
    // original-to-clone value mapping.
    mlir::IRMapping mapper;
    for (mlir::Operation *opToClone : declareAndAllocasToClone)
      rewriter.clone(*opToClone, mapper);

    // Clone the loop's body inside the worksharing construct using the mapped
    // memref values.
    rewriter.cloneRegionBefore(doLoop.getRegion(), wsLoopOp.getRegion(),
                               wsLoopOp.getRegion().begin(), mapper);

    // Swap the cloned body's terminator for `omp.yield`.
    mlir::Block &loopBody = wsLoopOp.getRegion().back();
    mlir::Operation *terminator = loopBody.getTerminator();
    rewriter.setInsertionPointToEnd(&loopBody);
    rewriter.create<mlir::omp::YieldOp>(terminator->getLoc());
    rewriter.eraseOp(terminator);

    rewriter.eraseOp(doLoop);

    return mlir::success();
  }
};
|
||
class DoConcurrentConversionPass | ||
: public fir::impl::DoConcurrentConversionPassBase< | ||
DoConcurrentConversionPass> { | ||
public: | ||
void runOnOperation() override { | ||
mlir::func::FuncOp func = getOperation(); | ||
|
||
if (func.isDeclaration()) { | ||
return; | ||
} | ||
|
||
auto *context = &getContext(); | ||
mlir::RewritePatternSet patterns(context); | ||
patterns.insert<DoConcurrentConversion>(context); | ||
mlir::ConversionTarget target(*context); | ||
target.addLegalDialect<fir::FIROpsDialect, hlfir::hlfirDialect, | ||
mlir::arith::ArithDialect, mlir::func::FuncDialect, | ||
mlir::omp::OpenMPDialect>(); | ||
|
||
target.addDynamicallyLegalOp<fir::DoLoopOp>( | ||
[](fir::DoLoopOp op) { return !op.getUnordered(); }); | ||
|
||
if (mlir::failed(mlir::applyFullConversion(getOperation(), target, | ||
std::move(patterns)))) { | ||
mlir::emitError(mlir::UnknownLoc::get(context), | ||
"error in converting do-concurrent op"); | ||
signalPassFailure(); | ||
} | ||
} | ||
}; | ||
} // namespace | ||
|
||
std::unique_ptr<mlir::Pass> fir::createDoConcurrentConversionPass() { | ||
return std::make_unique<DoConcurrentConversionPass>(); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
// Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`. | ||
|
||
// RUN: fir-opt --fopenmp-do-concurrent-conversion %s | FileCheck %s | ||
|
||
// CHECK-LABEL: func.func @do_concurrent_basic | ||
// Test input: a function whose single unordered `fir.do_loop` runs from 1 to
// 10 with step 1. Each iteration stores the induction variable to `_QFEi` and
// assigns the reloaded value to an element of the array `_QFEa`.
func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} { | ||
// Outside the parallel region, the array declaration and the two i32
// constants feeding the bounds are expected to remain.
// CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) | ||
// CHECK: %[[C1:.*]] = arith.constant 1 : i32 | ||
// CHECK: %[[C10:.*]] = arith.constant 10 : i32 | ||
|
||
%0 = fir.alloca i32 {bindc_name = "i"} | ||
%1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) | ||
%2 = fir.address_of(@_QFEa) : !fir.ref<!fir.array<10xi32>> | ||
%c10 = arith.constant 10 : index | ||
%3 = fir.shape %c10 : (index) -> !fir.shape<1> | ||
%4:2 = hlfir.declare %2(%3) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) | ||
%c1_i32 = arith.constant 1 : i32 | ||
%7 = fir.convert %c1_i32 : (i32) -> index | ||
%c10_i32 = arith.constant 10 : i32 | ||
%8 = fir.convert %c10_i32 : (i32) -> index | ||
%c1 = arith.constant 1 : index | ||
|
||
// CHECK-NOT: fir.do_loop | ||
|
||
// CHECK: omp.parallel { | ||
|
||
// The alloca and declare backing `i` are expected to be re-created as the
// first ops of the parallel region.
// CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"} | ||
// CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) | ||
|
||
// The ops defining the lower bound, upper bound, and step are expected to be
// cloned inside the parallel region.
// CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index | ||
// CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index | ||
// CHECK: %[[STEP:.*]] = arith.constant 1 : index | ||
|
||
// The loop body is expected inside an inclusive `omp.wsloop`, using the
// region-local binding of `i` and ending in `omp.yield`.
// CHECK: omp.wsloop for (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { | ||
// CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32 | ||
// CHECK-NEXT: fir.store %[[IV_IDX]] to %[[BINDING]]#1 : !fir.ref<i32> | ||
// CHECK-NEXT: %[[IV_VAL1:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32> | ||
// CHECK-NEXT: %[[IV_VAL2:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32> | ||
// CHECK-NEXT: %[[IV_VAL_I64:.*]] = fir.convert %[[IV_VAL2]] : (i32) -> i64 | ||
// CHECK-NEXT: %[[ARR_ACCESS:.*]] = hlfir.designate %[[ARR]]#0 (%[[IV_VAL_I64]]) : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32> | ||
// CHECK-NEXT: hlfir.assign %[[IV_VAL1]] to %[[ARR_ACCESS]] : i32, !fir.ref<i32> | ||
// CHECK-NEXT: omp.yield | ||
// CHECK-NEXT: } | ||
|
||
// CHECK-NEXT: omp.terminator | ||
// CHECK-NEXT: } | ||
fir.do_loop %arg0 = %7 to %8 step %c1 unordered { | ||
%13 = fir.convert %arg0 : (index) -> i32 | ||
fir.store %13 to %1#1 : !fir.ref<i32> | ||
%14 = fir.load %1#0 : !fir.ref<i32> | ||
%15 = fir.load %1#0 : !fir.ref<i32> | ||
%16 = fir.convert %15 : (i32) -> i64 | ||
%17 = hlfir.designate %4#0 (%16) : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32> | ||
hlfir.assign %14 to %17 : i32, !fir.ref<i32> | ||
} | ||
|
||
// CHECK-NOT: fir.do_loop | ||
|
||
return | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.