//===- LowerWorkshare.cpp - Lower the omp.workshare construct ------------===//
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements the lowering of omp.workshare to other omp constructs. |
| // |
// This pass is responsible for parallelizing the loops nested in
// workshare.loop_wrapper ops, while the Fortran-to-MLIR lowering and the
// HLFIR-to-FIR lowering pipelines are responsible for emitting the
// workshare.loop_wrapper ops where appropriate, as determined by the
// `shouldUseWorkshareLowering` function.
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include <flang/Optimizer/Builder/FIRBuilder.h> |
| #include <flang/Optimizer/Dialect/FIROps.h> |
| #include <flang/Optimizer/Dialect/FIRType.h> |
| #include <flang/Optimizer/HLFIR/HLFIROps.h> |
| #include <flang/Optimizer/OpenMP/Passes.h> |
| #include <llvm/ADT/BreadthFirstIterator.h> |
| #include <llvm/ADT/STLExtras.h> |
| #include <llvm/ADT/SmallVectorExtras.h> |
| #include <llvm/ADT/iterator_range.h> |
| #include <llvm/Support/ErrorHandling.h> |
| #include <mlir/Dialect/Arith/IR/Arith.h> |
| #include <mlir/Dialect/LLVMIR/LLVMTypes.h> |
| #include <mlir/Dialect/OpenMP/OpenMPClauseOperands.h> |
| #include <mlir/Dialect/OpenMP/OpenMPDialect.h> |
| #include <mlir/Dialect/SCF/IR/SCF.h> |
| #include <mlir/IR/BuiltinOps.h> |
| #include <mlir/IR/IRMapping.h> |
| #include <mlir/IR/OpDefinition.h> |
| #include <mlir/IR/PatternMatch.h> |
| #include <mlir/IR/Value.h> |
| #include <mlir/IR/Visitors.h> |
| #include <mlir/Interfaces/SideEffectInterfaces.h> |
| #include <mlir/Support/LLVM.h> |
| |
| #include <variant> |
| |
| namespace flangomp { |
| #define GEN_PASS_DEF_LOWERWORKSHARE |
| #include "flang/Optimizer/OpenMP/Passes.h.inc" |
| } // namespace flangomp |
| |
| #define DEBUG_TYPE "lower-workshare" |
| |
| using namespace mlir; |
| |
| namespace flangomp { |
| |
// Checks for the nesting pattern below, as we must avoid sharing the work of
// statements that are nested in certain constructs such as omp.critical or
// another omp.parallel.
| // |
| // omp.workshare { // `wsOp` |
| // ... |
| // omp.T { // `parent` |
| // ... |
| // `op` |
| // |
| template <typename T> |
| static bool isNestedIn(omp::WorkshareOp wsOp, Operation *op) { |
| T parent = op->getParentOfType<T>(); |
| if (!parent) |
| return false; |
| return wsOp->isProperAncestor(parent); |
| } |
| |
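// Returns whether the work of `op`, which binds to an enclosing
// omp.workshare, should be divided among the threads of the team. A sketch of
// a case where this returns false because an intervening omp.single changes
// the binding thread set:
//
// omp.workshare {
//   omp.single {
//     `op` // executed by only one thread, so its work must not be shared
//   }
// }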
| bool shouldUseWorkshareLowering(Operation *op) { |
| auto parentWorkshare = op->getParentOfType<omp::WorkshareOp>(); |
| |
| if (!parentWorkshare) |
| return false; |
| |
| if (isNestedIn<omp::CriticalOp>(parentWorkshare, op)) |
| return false; |
| |
| // 2.8.3 workshare Construct |
| // For a parallel construct, the construct is a unit of work with respect to |
| // the workshare construct. The statements contained in the parallel construct |
| // are executed by a new thread team. |
| if (isNestedIn<omp::ParallelOp>(parentWorkshare, op)) |
| return false; |
| |
| // 2.8.2 single Construct |
// Binding: The binding thread set for a single region is the current team. A
// single region binds to the innermost enclosing parallel region.
// Description: Only one of the encountering threads will execute the
// structured block associated with the single construct.
| if (isNestedIn<omp::SingleOp>(parentWorkshare, op)) |
| return false; |
| |
// Do not use workshare lowering until we support CFG in omp.workshare.
| if (parentWorkshare.getRegion().getBlocks().size() != 1) |
| return false; |
| |
| return true; |
| } |
| |
| } // namespace flangomp |
| |
| namespace { |
| |
| struct SingleRegion { |
| Block::iterator begin, end; |
| }; |
| |
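// Returns true if `op` is or contains a workshare.loop_wrapper that binds to
// the workshare region currently being lowered, in which case its work must
// be shared among the team rather than executed inside an omp.single.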
| static bool mustParallelizeOp(Operation *op) { |
| return op |
| ->walk([&](Operation *nested) { |
| // We need to be careful not to pick up workshare.loop_wrapper in nested |
| // omp.parallel{omp.workshare} regions, i.e. make sure that `nested` |
| // binds to the workshare region we are currently handling. |
| // |
| // For example: |
| // |
| // omp.parallel { |
| // omp.workshare { // currently handling this |
| // omp.parallel { |
| // omp.workshare { // nested workshare |
| // omp.workshare.loop_wrapper {} |
| // |
| // Therefore, we skip if we encounter a nested omp.workshare. |
| if (isa<omp::WorkshareOp>(nested)) |
| return WalkResult::skip(); |
| if (isa<omp::WorkshareLoopWrapperOp>(nested)) |
| return WalkResult::interrupt(); |
| return WalkResult::advance(); |
| }) |
| .wasInterrupted(); |
| } |
| |
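// An operation is safe to parallelize if every thread of the team can execute
// it redundantly without changing the program's behavior: declare ops only
// establish variable metadata, and any other op must be free of memory
// effects.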
| static bool isSafeToParallelize(Operation *op) { |
| return isa<hlfir::DeclareOp>(op) || isa<fir::DeclareOp>(op) || |
| isMemoryEffectFree(op); |
| } |
| |
/// Simple shallow copies suffice for our purposes in this pass, so we
/// implement this simpler alternative to the full-fledged `createCopyFunc` in
/// the frontend.
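///
/// For illustration, for a variable of type !fir.ref<i32> the generated
/// function would look roughly like this (the exact symbol name comes from
/// fir::getTypeAsString):
///
///   func.func private @_workshare_copy_i32(%arg0: !fir.ref<i32>,
///                                          %arg1: !fir.ref<i32>) {
///     %0 = fir.load %arg1 : !fir.ref<i32>
///     fir.store %0 to %arg0 : !fir.ref<i32>
///     return
///   }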
| static mlir::func::FuncOp createCopyFunc(mlir::Location loc, mlir::Type varType, |
| fir::FirOpBuilder builder) { |
| mlir::ModuleOp module = builder.getModule(); |
| auto rt = cast<fir::ReferenceType>(varType); |
| mlir::Type eleTy = rt.getEleTy(); |
| std::string copyFuncName = |
| fir::getTypeAsString(eleTy, builder.getKindMap(), "_workshare_copy"); |
| |
| if (auto decl = module.lookupSymbol<mlir::func::FuncOp>(copyFuncName)) |
| return decl; |
| |
// Create the copy function.
| mlir::OpBuilder::InsertionGuard guard(builder); |
| mlir::OpBuilder modBuilder(module.getBodyRegion()); |
| llvm::SmallVector<mlir::Type> argsTy = {varType, varType}; |
| auto funcType = mlir::FunctionType::get(builder.getContext(), argsTy, {}); |
| mlir::func::FuncOp funcOp = |
| mlir::func::FuncOp::create(modBuilder, loc, copyFuncName, funcType); |
| funcOp.setVisibility(mlir::SymbolTable::Visibility::Private); |
| fir::factory::setInternalLinkage(funcOp); |
| builder.createBlock(&funcOp.getRegion(), funcOp.getRegion().end(), argsTy, |
| {loc, loc}); |
| builder.setInsertionPointToStart(&funcOp.getRegion().back()); |
| |
| Value loaded = fir::LoadOp::create(builder, loc, funcOp.getArgument(1)); |
| fir::StoreOp::create(builder, loc, loaded, funcOp.getArgument(0)); |
| |
| mlir::func::ReturnOp::create(builder, loc); |
| return funcOp; |
| } |
| |
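// Returns true if `user` (walked up to the ancestor that is a direct child of
// `parentOp`) lies outside the operation range delimited by the single region
// `sr`.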
| static bool isUserOutsideSR(Operation *user, Operation *parentOp, |
| SingleRegion sr) { |
| while (user->getParentOp() != parentOp) |
| user = user->getParentOp(); |
| return sr.begin->getBlock() != user->getBlock() || |
| !(user->isBeforeInBlock(&*sr.end) && sr.begin->isBeforeInBlock(user)); |
| } |
| |
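// Returns true if `v`, or a value derived from it through a chain of
// safe-to-parallelize operations inside `sr`, has a use outside the single
// region `sr`.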
| static bool isTransitivelyUsedOutside(Value v, SingleRegion sr) { |
| Block *srBlock = sr.begin->getBlock(); |
| Operation *parentOp = srBlock->getParentOp(); |
| |
| for (auto &use : v.getUses()) { |
| Operation *user = use.getOwner(); |
| if (isUserOutsideSR(user, parentOp, sr)) |
| return true; |
| |
| // Now we know user is inside `sr`. |
| |
| // Results of nested users cannot be used outside of `sr`. |
| if (user->getBlock() != srBlock) |
| continue; |
| |
| // A non-safe to parallelize operation will be checked for uses outside |
| // separately. |
| if (!isSafeToParallelize(user)) |
| continue; |
| |
| // For safe to parallelize operations, we need to check if there is a |
| // transitive use of `v` through them. |
| for (auto res : user->getResults()) |
| if (isTransitivelyUsedOutside(res, sr)) |
| return true; |
| } |
| return false; |
| } |
| |
/// We clone pure operations in both the parallel and single blocks. This
/// function cleans them up if they end up with no uses.
| static void cleanupBlock(Block *block) { |
| for (Operation &op : llvm::make_early_inc_range( |
| llvm::make_range(block->rbegin(), block->rend()))) |
| if (isOpTriviallyDead(&op)) |
| op.erase(); |
| } |
| |
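/// Clones `sourceRegion` into `targetRegion`, wrapping maximal runs of
/// operations that need not be parallelized in omp.single (broadcasting any
/// escaping results via copyprivate) and rewriting each
/// workshare.loop_wrapper into an omp.wsloop so that its iterations are
/// shared among the team.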
| static void parallelizeRegion(Region &sourceRegion, Region &targetRegion, |
| IRMapping &rootMapping, Location loc, |
| mlir::DominanceInfo &di) { |
| OpBuilder rootBuilder(sourceRegion.getContext()); |
| ModuleOp m = sourceRegion.getParentOfType<ModuleOp>(); |
| OpBuilder copyFuncBuilder(m.getBodyRegion()); |
| fir::FirOpBuilder firCopyFuncBuilder(copyFuncBuilder, m); |
| |
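// Allocates a temporary for `v`, stores the value computed inside the single
// region into it, and reloads it in the parallel block so that all threads
// observe it. Returns the temporary (to be broadcast via copyprivate), or
// nullptr if `v` has already been remapped.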
| auto mapReloadedValue = |
| [&](Value v, OpBuilder allocaBuilder, OpBuilder singleBuilder, |
| OpBuilder parallelBuilder, IRMapping singleMapping) -> Value { |
| if (auto reloaded = rootMapping.lookupOrNull(v)) |
| return nullptr; |
| Type ty = v.getType(); |
| Value alloc = fir::AllocaOp::create(allocaBuilder, loc, ty); |
| fir::StoreOp::create(singleBuilder, loc, singleMapping.lookup(v), alloc); |
| Value reloaded = fir::LoadOp::create(parallelBuilder, loc, ty, alloc); |
| rootMapping.map(v, reloaded); |
| return alloc; |
| }; |
| |
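// Clones the operations of the single region `sr` into the omp.single block,
// hoisting fir.alloca ops into the alloca block and replicating
// safe-to-parallelize ops into the parallel block where their operands are
// available. Returns whether everything was parallelizable along with the
// values that must be broadcast via copyprivate.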
| auto moveToSingle = |
| [&](SingleRegion sr, OpBuilder allocaBuilder, OpBuilder singleBuilder, |
| OpBuilder parallelBuilder) -> std::pair<bool, SmallVector<Value>> { |
| IRMapping singleMapping = rootMapping; |
| SmallVector<Value> copyPrivate; |
| bool allParallelized = true; |
| |
| for (Operation &op : llvm::make_range(sr.begin, sr.end)) { |
| if (isSafeToParallelize(&op)) { |
| singleBuilder.clone(op, singleMapping); |
| if (llvm::all_of(op.getOperands(), [&](Value opr) { |
| // Either we have already remapped it |
| bool remapped = rootMapping.contains(opr); |
| // Or it is available because it dominates `sr` |
| bool dominates = di.properlyDominates(opr, &*sr.begin); |
| return remapped || dominates; |
| })) { |
// Safe-to-parallelize operations whose operands are all available in
// the root parallel block can be executed there.
| parallelBuilder.clone(op, rootMapping); |
| } else { |
// If an operand is unavailable, its non-safe-to-parallelize producer
// was not reloaded, which only happens when that value has no
// transitive use outside `sr`. Since `op` is safe to parallelize, any
// outside use of its results would be such a transitive use, so `op`
// must have no transitive uses outside `sr` either.
| assert(llvm::all_of(op.getResults(), [&](Value v) { |
| return !isTransitivelyUsedOutside(v, sr); |
| })); |
| allParallelized = false; |
| } |
| } else if (auto alloca = dyn_cast<fir::AllocaOp>(&op)) { |
| auto hoisted = |
| cast<fir::AllocaOp>(allocaBuilder.clone(*alloca, singleMapping)); |
| rootMapping.map(&*alloca, &*hoisted); |
| rootMapping.map(alloca.getResult(), hoisted.getResult()); |
| copyPrivate.push_back(hoisted); |
| allParallelized = false; |
| } else { |
| singleBuilder.clone(op, singleMapping); |
| // Prepare reloaded values for results of operations that cannot be |
| // safely parallelized and which are used after the region `sr`. |
| for (auto res : op.getResults()) { |
| if (isTransitivelyUsedOutside(res, sr)) { |
| auto alloc = mapReloadedValue(res, allocaBuilder, singleBuilder, |
| parallelBuilder, singleMapping); |
| if (alloc) |
| copyPrivate.push_back(alloc); |
| } |
| } |
| allParallelized = false; |
| } |
| } |
| omp::TerminatorOp::create(singleBuilder, loc); |
| return {allParallelized, copyPrivate}; |
| }; |
| |
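// Create all target blocks and map the source blocks and their arguments up
// front so that branch targets resolve when terminators are cloned.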
| for (Block &block : sourceRegion) { |
| Block *targetBlock = rootBuilder.createBlock( |
| &targetRegion, {}, block.getArgumentTypes(), |
| llvm::map_to_vector(block.getArguments(), |
| [](BlockArgument arg) { return arg.getLoc(); })); |
| rootMapping.map(&block, targetBlock); |
| rootMapping.map(block.getArguments(), targetBlock->getArguments()); |
| } |
| |
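// Splits a source block into an alternating sequence of single regions and
// must-parallelize operations, then emits omp.single ops for the former and
// omp.wsloop ops (or recursively parallelized clones) for the latter into the
// matching target block.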
| auto handleOneBlock = [&](Block &block) { |
| Block &targetBlock = *rootMapping.lookup(&block); |
| rootBuilder.setInsertionPointToStart(&targetBlock); |
| Operation *terminator = block.getTerminator(); |
| SmallVector<std::variant<SingleRegion, Operation *>> regions; |
| |
| auto it = block.begin(); |
| auto getOneRegion = [&]() { |
| if (&*it == terminator) |
| return false; |
| if (mustParallelizeOp(&*it)) { |
| regions.push_back(&*it); |
| it++; |
| return true; |
| } |
| SingleRegion sr; |
| sr.begin = it; |
| while (&*it != terminator && !mustParallelizeOp(&*it)) |
| it++; |
| sr.end = it; |
| assert(sr.begin != sr.end); |
| regions.push_back(sr); |
| return true; |
| }; |
| while (getOneRegion()) |
| ; |
| |
| for (auto [i, opOrSingle] : llvm::enumerate(regions)) { |
| bool isLast = i + 1 == regions.size(); |
| if (std::holds_alternative<SingleRegion>(opOrSingle)) { |
| OpBuilder singleBuilder(sourceRegion.getContext()); |
| Block *singleBlock = new Block(); |
| singleBuilder.setInsertionPointToStart(singleBlock); |
| |
| OpBuilder allocaBuilder(sourceRegion.getContext()); |
| Block *allocaBlock = new Block(); |
| allocaBuilder.setInsertionPointToStart(allocaBlock); |
| |
| OpBuilder parallelBuilder(sourceRegion.getContext()); |
| Block *parallelBlock = new Block(); |
| parallelBuilder.setInsertionPointToStart(parallelBlock); |
| |
| auto [allParallelized, copyprivateVars] = |
| moveToSingle(std::get<SingleRegion>(opOrSingle), allocaBuilder, |
| singleBuilder, parallelBuilder); |
| if (allParallelized) { |
// The single region was not required, as all operations were safe to
// parallelize.
| assert(copyprivateVars.empty()); |
| assert(allocaBlock->empty()); |
| delete singleBlock; |
| } else { |
| omp::SingleOperands singleOperands; |
| if (isLast) |
| singleOperands.nowait = rootBuilder.getUnitAttr(); |
| singleOperands.copyprivateVars = copyprivateVars; |
| cleanupBlock(singleBlock); |
| for (auto var : singleOperands.copyprivateVars) { |
| mlir::func::FuncOp funcOp = |
| createCopyFunc(loc, var.getType(), firCopyFuncBuilder); |
| singleOperands.copyprivateSyms.push_back( |
| SymbolRefAttr::get(funcOp)); |
| } |
| omp::SingleOp singleOp = |
| omp::SingleOp::create(rootBuilder, loc, singleOperands); |
| singleOp.getRegion().push_back(singleBlock); |
| targetRegion.front().getOperations().splice( |
| singleOp->getIterator(), allocaBlock->getOperations()); |
| } |
| rootBuilder.getInsertionBlock()->getOperations().splice( |
| rootBuilder.getInsertionPoint(), parallelBlock->getOperations()); |
| delete allocaBlock; |
| delete parallelBlock; |
| } else { |
| auto op = std::get<Operation *>(opOrSingle); |
| if (auto wslw = dyn_cast<omp::WorkshareLoopWrapperOp>(op)) { |
| omp::WsloopOperands wsloopOperands; |
| if (isLast) |
| wsloopOperands.nowait = rootBuilder.getUnitAttr(); |
| auto wsloop = |
| mlir::omp::WsloopOp::create(rootBuilder, loc, wsloopOperands); |
| auto clonedWslw = cast<omp::WorkshareLoopWrapperOp>( |
| rootBuilder.clone(*wslw, rootMapping)); |
| wsloop.getRegion().takeBody(clonedWslw.getRegion()); |
| clonedWslw->erase(); |
| } else { |
| assert(mustParallelizeOp(op)); |
| Operation *cloned = rootBuilder.cloneWithoutRegions(*op, rootMapping); |
| for (auto [region, clonedRegion] : |
| llvm::zip(op->getRegions(), cloned->getRegions())) |
| parallelizeRegion(region, clonedRegion, rootMapping, loc, di); |
| } |
| } |
| } |
| |
| rootBuilder.clone(*block.getTerminator(), rootMapping); |
| }; |
| |
| if (sourceRegion.hasOneBlock()) { |
| handleOneBlock(sourceRegion.front()); |
| } else if (!sourceRegion.empty()) { |
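// Visit blocks in dominator-tree order so that values defined in dominating
// blocks are already mapped when their uses in dominated blocks are cloned.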
| auto &domTree = di.getDomTree(&sourceRegion); |
| for (auto node : llvm::breadth_first(domTree.getRootNode())) { |
| handleOneBlock(*node->getBlock()); |
| } |
| } |
| |
| for (Block &targetBlock : targetRegion) |
| cleanupBlock(&targetBlock); |
| } |
| |
/// Lowers omp.workshare to a sequence of single-threaded regions (omp.single)
/// and parallel loops (omp.wsloop).
| /// |
| /// For example: |
| /// |
| /// omp.workshare { |
| /// %a = fir.allocmem |
| /// omp.workshare.loop_wrapper {} |
| /// fir.call Assign %b %a |
| /// fir.freemem %a |
| /// } |
| /// |
| /// becomes |
| /// |
| /// %tmp = fir.alloca |
| /// omp.single copyprivate(%tmp) { |
| /// %a = fir.allocmem |
| /// fir.store %a %tmp |
| /// } |
| /// %a_reloaded = fir.load %tmp |
| /// omp.workshare.loop_wrapper {} |
| /// omp.single { |
| /// fir.call Assign %b %a_reloaded |
| /// fir.freemem %a_reloaded |
| /// } |
| /// |
/// Note that we allocate temporary memory for values defined in omp.single
/// regions that need to be accessed by all threads, and we broadcast them
/// using the single construct's copyprivate clause.
| LogicalResult lowerWorkshare(mlir::omp::WorkshareOp wsOp, DominanceInfo &di) { |
| Location loc = wsOp->getLoc(); |
| IRMapping rootMapping; |
| |
| OpBuilder rootBuilder(wsOp); |
| |
| // FIXME Currently, we only support workshare constructs with structured |
| // control flow. The transformation itself supports CFG, however, once we |
| // transform the MLIR region in the omp.workshare, we need to inline that |
| // region in the parent block. We have no guarantees at this point of the |
| // pipeline that the parent op supports CFG (e.g. fir.if), thus this is not |
| // generally possible. The alternative is to put the lowered region in an |
| // operation akin to scf.execute_region, which will get lowered at the same |
| // time when fir ops get lowered to CFG. However, SCF is not registered in |
| // flang so we cannot use it. Remove this requirement once we have |
| // scf.execute_region or an alternative operation available. |
| if (wsOp.getRegion().getBlocks().size() == 1) { |
| // This operation is just a placeholder which will be erased later. We need |
| // it because our `parallelizeRegion` function works on regions and not |
| // blocks. |
| omp::WorkshareOp newOp = |
| omp::WorkshareOp::create(rootBuilder, loc, omp::WorkshareOperands()); |
| if (!wsOp.getNowait()) |
| omp::BarrierOp::create(rootBuilder, loc); |
| |
| parallelizeRegion(wsOp.getRegion(), newOp.getRegion(), rootMapping, loc, |
| di); |
| |
| // Inline the contents of the placeholder workshare op into its parent |
| // block. |
| Block *theBlock = &newOp.getRegion().front(); |
| Operation *term = theBlock->getTerminator(); |
| Block *parentBlock = wsOp->getBlock(); |
| parentBlock->getOperations().splice(newOp->getIterator(), |
| theBlock->getOperations()); |
| assert(term->getNumOperands() == 0); |
| term->erase(); |
| newOp->erase(); |
| wsOp->erase(); |
| } else { |
| // Otherwise just change the operation to an omp.single. |
| |
| wsOp->emitWarning( |
| "omp workshare with unstructured control flow is currently " |
| "unsupported and will be serialized."); |
| |
// `shouldUseWorkshareLowering` should have guaranteed that there are no
// omp.workshare.loop_wrapper ops that bind to this omp.workshare.
| assert(!wsOp->walk([&](Operation *op) { |
// Nested omp.workshare ops can have their own
// omp.workshare.loop_wrapper ops.
| if (isa<omp::WorkshareOp>(op)) |
| return WalkResult::skip(); |
| if (isa<omp::WorkshareLoopWrapperOp>(op)) |
| return WalkResult::interrupt(); |
| return WalkResult::advance(); |
| }) |
| .wasInterrupted()); |
| |
| omp::SingleOperands operands; |
| operands.nowait = wsOp.getNowaitAttr(); |
| omp::SingleOp newOp = omp::SingleOp::create(rootBuilder, loc, operands); |
| |
| newOp.getRegion().getBlocks().splice(newOp.getRegion().getBlocks().begin(), |
| wsOp.getRegion().getBlocks()); |
| wsOp->erase(); |
| } |
| return success(); |
| } |
| |
| class LowerWorksharePass |
| : public flangomp::impl::LowerWorkshareBase<LowerWorksharePass> { |
| public: |
| void runOnOperation() override { |
| mlir::DominanceInfo &di = getAnalysis<mlir::DominanceInfo>(); |
| getOperation()->walk([&](mlir::omp::WorkshareOp wsOp) { |
| if (failed(lowerWorkshare(wsOp, di))) |
| signalPassFailure(); |
| }); |
| } |
| }; |
| } // namespace |