| //===- DoConcurrentConversion.cpp -- map `DO CONCURRENT` to OpenMP loops --===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
#include "flang/Optimizer/Builder/DirectivesCommon.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/HLFIRTools.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "flang/Optimizer/OpenMP/Passes.h"
#include "flang/Optimizer/OpenMP/Utils.h"
#include "flang/Support/OpenMP-utils.h"
#include "flang/Utils/OpenMP.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/SmallPtrSet.h"

#include <functional>
#include <utility>
| |
| namespace flangomp { |
| #define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS |
| #include "flang/Optimizer/OpenMP/Passes.h.inc" |
| } // namespace flangomp |
| |
| #define DEBUG_TYPE "do-concurrent-conversion" |
| #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ") |
| |
| namespace { |
| namespace looputils { |
| /// Stores info needed about the induction/iteration variable for each `do |
| /// concurrent` in a loop nest. |
| struct InductionVariableInfo { |
| InductionVariableInfo(fir::DoConcurrentLoopOp loop, |
| mlir::Value inductionVar) { |
| populateInfo(loop, inductionVar); |
| } |
| /// The operation allocating memory for iteration variable. |
| mlir::Operation *iterVarMemDef; |
| /// the operation(s) updating the iteration variable with the current |
| /// iteration number. |
| llvm::SmallVector<mlir::Operation *, 2> indVarUpdateOps; |
| |
| private: |
| /// For the \p doLoop parameter, find the following: |
| /// |
| /// 1. The operation that declares its iteration variable or allocates memory |
| /// for it. For example, give the following loop: |
| /// ``` |
| /// ... |
| /// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ... |
| /// ... |
| /// fir.do_concurrent.loop (%ind_var) = (%lb) to (%ub) step (%s) { |
| /// %ind_var_conv = fir.convert %ind_var : (index) -> i32 |
| /// fir.store %ind_var_conv to %i#1 : !fir.ref<i32> |
| /// ... |
| /// } |
| /// ``` |
| /// |
| /// This function sets the `iterVarMemDef` member to the `hlfir.declare` op |
| /// for `%i`. |
| /// |
| /// 2. The operation(s) that update the loop's iteration variable from its |
| /// induction variable. For the above example, the `indVarUpdateOps` is |
| /// populated with the first 2 ops in the loop's body. |
| /// |
| /// Note: The current implementation is dependent on how flang emits loop |
| /// bodies; which is sufficient for the current simple test/use cases. If this |
| /// proves to be insufficient, this should be made more generic. |
| void populateInfo(fir::DoConcurrentLoopOp loop, mlir::Value inductionVar) { |
| mlir::Value result = nullptr; |
| |
| // Checks if a StoreOp is updating the memref of the loop's iteration |
| // variable. |
| auto isStoringIV = [&](fir::StoreOp storeOp) { |
| // Direct store into the IV memref. |
| if (storeOp.getValue() == inductionVar) { |
| indVarUpdateOps.push_back(storeOp); |
| return true; |
| } |
| |
| // Indirect store into the IV memref. |
| if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>( |
| storeOp.getValue().getDefiningOp())) { |
| if (convertOp.getOperand() == inductionVar) { |
| indVarUpdateOps.push_back(convertOp); |
| indVarUpdateOps.push_back(storeOp); |
| return true; |
| } |
| } |
| |
| return false; |
| }; |
| |
| for (mlir::Operation &op : loop) { |
| if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op)) |
| if (isStoringIV(storeOp)) { |
| result = storeOp.getMemref(); |
| break; |
| } |
| } |
| |
| assert(result != nullptr && result.getDefiningOp() != nullptr); |
| iterVarMemDef = result.getDefiningOp(); |
| } |
| }; |
| |
| using InductionVariableInfos = llvm::SmallVector<InductionVariableInfo>; |
| |
| /// Collect the list of values used inside the loop but defined outside of it. |
| void collectLoopLiveIns(fir::DoConcurrentLoopOp loop, |
| llvm::SmallVectorImpl<mlir::Value> &liveIns) { |
| llvm::SmallDenseSet<mlir::Value> seenValues; |
| llvm::SmallPtrSet<mlir::Operation *, 8> seenOps; |
| |
| for (auto [lb, ub, st] : llvm::zip_equal( |
| loop.getLowerBound(), loop.getUpperBound(), loop.getStep())) { |
| liveIns.push_back(lb); |
| liveIns.push_back(ub); |
| liveIns.push_back(st); |
| } |
| |
| mlir::visitUsedValuesDefinedAbove( |
| loop.getRegion(), [&](mlir::OpOperand *operand) { |
| if (!seenValues.insert(operand->get()).second) |
| return; |
| |
| mlir::Operation *definingOp = operand->get().getDefiningOp(); |
| // We want to collect ops corresponding to live-ins only once. |
| if (definingOp && !seenOps.insert(definingOp).second) |
| return; |
| |
| liveIns.push_back(operand->get()); |
| }); |
| |
| for (mlir::Value local : loop.getLocalVars()) |
| liveIns.push_back(local); |
| |
| for (mlir::Value reduce : loop.getReduceVars()) |
| liveIns.push_back(reduce); |
| } |
| |
| /// Collects values that are local to a loop: "loop-local values". A loop-local |
| /// value is one that is used exclusively inside the loop but allocated outside |
| /// of it. This usually corresponds to temporary values that are used inside the |
| /// loop body for initialzing other variables for example. |
| /// |
| /// See `flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90` for an |
| /// example of why we need this. |
| /// |
| /// \param [in] doLoop - the loop within which the function searches for values |
| /// used exclusively inside. |
| /// |
| /// \param [out] locals - the list of loop-local values detected for \p doLoop. |
| void collectLoopLocalValues(fir::DoConcurrentLoopOp loop, |
| llvm::SetVector<mlir::Value> &locals) { |
| loop.walk([&](mlir::Operation *op) { |
| for (mlir::Value operand : op->getOperands()) { |
| if (locals.contains(operand)) |
| continue; |
| |
| bool isLocal = true; |
| |
| if (!mlir::isa_and_present<fir::AllocaOp>(operand.getDefiningOp())) |
| continue; |
| |
| // Values defined inside the loop are not interesting since they do not |
| // need to be localized. |
| if (loop->isAncestor(operand.getDefiningOp())) |
| continue; |
| |
| for (auto *user : operand.getUsers()) { |
| if (!loop->isAncestor(user)) { |
| isLocal = false; |
| break; |
| } |
| } |
| |
| if (isLocal) |
| locals.insert(operand); |
| } |
| }); |
| } |
| |
| /// For a "loop-local" value \p local within a loop's scope, localizes that |
| /// value within the scope of the parallel region the loop maps to. Towards that |
| /// end, this function moves the allocation of \p local within \p allocRegion. |
| /// |
| /// \param local - the value used exclusively within a loop's scope (see |
| /// collectLoopLocalValues). |
| /// |
| /// \param allocRegion - the parallel region where \p local's allocation will be |
| /// privatized. |
| /// |
| /// \param rewriter - builder used for updating \p allocRegion. |
| static void localizeLoopLocalValue(mlir::Value local, mlir::Region &allocRegion, |
| mlir::ConversionPatternRewriter &rewriter) { |
| rewriter.moveOpBefore(local.getDefiningOp(), &allocRegion.front().front()); |
| } |
| } // namespace looputils |
| |
| class DoConcurrentConversion |
| : public mlir::OpConversionPattern<fir::DoConcurrentOp> { |
| private: |
| struct TargetDeclareShapeCreationInfo { |
| // Note: We use `std::vector` (rather than `llvm::SmallVector` as usual) to |
| // interface more easily `ShapeShiftOp::getOrigins()` which returns |
| // `std::vector`. |
| std::vector<mlir::Value> startIndices; |
| std::vector<mlir::Value> extents; |
| |
| TargetDeclareShapeCreationInfo(mlir::Value liveIn) { |
| mlir::Value shape = nullptr; |
| mlir::Operation *liveInDefiningOp = liveIn.getDefiningOp(); |
| auto declareOp = |
| mlir::dyn_cast_if_present<hlfir::DeclareOp>(liveInDefiningOp); |
| |
| if (declareOp != nullptr) |
| shape = declareOp.getShape(); |
| |
| if (!shape) |
| return; |
| |
| auto shapeOp = |
| mlir::dyn_cast_if_present<fir::ShapeOp>(shape.getDefiningOp()); |
| auto shapeShiftOp = |
| mlir::dyn_cast_if_present<fir::ShapeShiftOp>(shape.getDefiningOp()); |
| |
| if (!shapeOp && !shapeShiftOp) |
| TODO(liveIn.getLoc(), |
| "Shapes not defined by `fir.shape` or `fir.shape_shift` op's are" |
| "not supported yet."); |
| |
| if (shapeShiftOp != nullptr) |
| startIndices = shapeShiftOp.getOrigins(); |
| |
| extents = shapeOp != nullptr |
| ? std::vector<mlir::Value>(shapeOp.getExtents().begin(), |
| shapeOp.getExtents().end()) |
| : shapeShiftOp.getExtents(); |
| } |
| |
| bool isShapedValue() const { return !extents.empty(); } |
| bool isShapeShiftedValue() const { return !startIndices.empty(); } |
| }; |
| |
| using LiveInShapeInfoMap = |
| llvm::DenseMap<mlir::Value, TargetDeclareShapeCreationInfo>; |
| |
| public: |
| using mlir::OpConversionPattern<fir::DoConcurrentOp>::OpConversionPattern; |
| |
  /// \param mapToDevice - when true, map `do concurrent` loops to device
  /// (`target teams distribute parallel do`) constructs; otherwise, to host
  /// (`parallel do`) constructs.
  /// \param concurrentLoopsToSkip - shared set of loops the enclosing pass
  /// treats as legal and leaves unconverted (see `matchAndRewrite`).
  /// \param moduleSymbolTable - the enclosing module's symbol table; used to
  /// look up FIR localizers/reducers and to insert their OpenMP counterparts.
  DoConcurrentConversion(
      mlir::MLIRContext *context, bool mapToDevice,
      llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip,
      mlir::SymbolTable &moduleSymbolTable)
      : OpConversionPattern(context), mapToDevice(mapToDevice),
        concurrentLoopsToSkip(concurrentLoopsToSkip),
        moduleSymbolTable(moduleSymbolTable) {}
| |
  /// Convert one `fir.do_concurrent` wrapper (and the single
  /// `fir.do_concurrent.loop` nested in it) into the equivalent OpenMP op
  /// tree: `parallel` + `wsloop` on the host, or
  /// `target` + `teams` + `distribute` + `parallel` + `wsloop` on the device.
  mlir::LogicalResult
  matchAndRewrite(fir::DoConcurrentOp doLoop, OpAdaptor adaptor,
                  mlir::ConversionPatternRewriter &rewriter) const override {
    looputils::InductionVariableInfos ivInfos;
    // The `fir.do_concurrent.loop` op is the terminator of the wrapper's
    // region.
    auto loop = mlir::cast<fir::DoConcurrentLoopOp>(
        doLoop.getRegion().back().getTerminator());

    auto indVars = loop.getLoopInductionVars();
    assert(indVars.has_value());

    for (mlir::Value indVar : *indVars)
      ivInfos.emplace_back(loop, indVar);

    llvm::SmallVector<mlir::Value> loopNestLiveIns;
    looputils::collectLoopLiveIns(loop, loopNestLiveIns);
    assert(!loopNestLiveIns.empty());

    llvm::SetVector<mlir::Value> locals;
    looputils::collectLoopLocalValues(loop, locals);

    // We do not want to map "loop-local" values to the device through
    // `omp.map.info` ops. Therefore, we remove them from the list of live-ins.
    loopNestLiveIns.erase(llvm::remove_if(loopNestLiveIns,
                                          [&](mlir::Value liveIn) {
                                            return locals.contains(liveIn);
                                          }),
                          loopNestLiveIns.end());

    mlir::omp::TargetOp targetOp;
    mlir::omp::LoopNestOperands loopNestClauseOps;

    // Maps host/FIR values to their device/OpenMP counterparts while the op
    // tree is being built.
    mlir::IRMapping mapper;

    if (mapToDevice) {
      mlir::ModuleOp module = doLoop->getParentOfType<mlir::ModuleOp>();
      bool isTargetDevice =
          llvm::cast<mlir::omp::OffloadModuleInterface>(*module)
              .getIsTargetDevice();

      mlir::omp::TargetOperands targetClauseOps;
      // Bounds become `host_eval` operands only when compiling for the host;
      // in the device pass they are materialized inside the target region.
      genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, loopNestClauseOps,
                           isTargetDevice ? nullptr : &targetClauseOps);

      LiveInShapeInfoMap liveInShapeInfoMap;
      fir::FirOpBuilder builder(
          rewriter,
          fir::getKindMapping(doLoop->getParentOfType<mlir::ModuleOp>()));

      // Each live-in gets an `omp.map.info` op plus (for arrays) the shape
      // info needed to re-declare it inside the target region.
      for (mlir::Value liveIn : loopNestLiveIns) {
        targetClauseOps.mapVars.push_back(
            genMapInfoOpForLiveIn(builder, liveIn));
        liveInShapeInfoMap.insert(
            {liveIn, TargetDeclareShapeCreationInfo(liveIn)});
      }

      targetOp =
          genTargetOp(doLoop.getLoc(), rewriter, mapper, loopNestLiveIns,
                      targetClauseOps, loopNestClauseOps, liveInShapeInfoMap);
      genTeamsOp(rewriter, loop, mapper);
    }

    mlir::omp::ParallelOp parallelOp =
        genParallelOp(rewriter, loop, ivInfos, mapper);

    // Only set as composite when part of `distribute parallel do`.
    parallelOp.setComposite(mapToDevice);

    if (!mapToDevice)
      genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, loopNestClauseOps);

    // Move allocations of loop-local temporaries into the parallel region so
    // each thread gets its own copy.
    for (mlir::Value local : locals)
      looputils::localizeLoopLocalValue(local, parallelOp.getRegion(),
                                        rewriter);

    if (mapToDevice)
      genDistributeOp(doLoop.getLoc(), rewriter).setComposite(/*val=*/true);

    auto [loopNestOp, wsLoopOp] =
        genWsLoopOp(rewriter, loop, mapper, loopNestClauseOps,
                    /*isComposite=*/mapToDevice);

    // `local` region arguments are transferred/cloned from the `do concurrent`
    // loop to the loopnest op when the region is cloned above. Instead, these
    // region arguments should be on the workshare loop's region.
    if (mapToDevice) {
      // Device: `private` args live on `omp.parallel`, `reduce` args on
      // `omp.wsloop`.
      for (auto [parallelArg, loopNestArg] : llvm::zip_equal(
               parallelOp.getRegion().getArguments(),
               loopNestOp.getRegion().getArguments().slice(
                   loop.getLocalOperandsStart(), loop.getNumLocalOperands())))
        rewriter.replaceAllUsesWith(loopNestArg, parallelArg);

      for (auto [wsloopArg, loopNestArg] : llvm::zip_equal(
               wsLoopOp.getRegion().getArguments(),
               loopNestOp.getRegion().getArguments().slice(
                   loop.getReduceOperandsStart(), loop.getNumReduceOperands())))
        rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
    } else {
      // Host: both `private` and `reduce` args live on `omp.wsloop`; they
      // follow the induction-variable args in the loop-nest region.
      for (auto [wsloopArg, loopNestArg] :
           llvm::zip_equal(wsLoopOp.getRegion().getArguments(),
                           loopNestOp.getRegion().getArguments().drop_front(
                               loopNestClauseOps.loopLowerBounds.size())))
        rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
    }

    // Drop the now-unused local/reduce arguments from the loop-nest region.
    // Erasing repeatedly at the same index removes them one after another.
    for (unsigned i = 0;
         i < loop.getLocalVars().size() + loop.getReduceVars().size(); ++i)
      loopNestOp.getRegion().eraseArgument(
          loopNestClauseOps.loopLowerBounds.size());

    rewriter.setInsertionPoint(doLoop);
    fir::FirOpBuilder builder(
        rewriter,
        fir::getKindMapping(doLoop->getParentOfType<mlir::ModuleOp>()));

    // Collect iteration variable(s) allocations so that we can move them
    // outside the `fir.do_concurrent` wrapper (before erasing it).
    llvm::SmallVector<mlir::Operation *> opsToMove;
    for (mlir::Operation &op : llvm::drop_end(doLoop))
      opsToMove.push_back(&op);

    mlir::Block *allocBlock = builder.getAllocaBlock();

    // Reverse iteration + insertion at the block front preserves the ops'
    // original relative order.
    for (mlir::Operation *op : llvm::reverse(opsToMove)) {
      rewriter.moveOpBefore(op, allocBlock, allocBlock->begin());
    }

    // Mark `unordered` loops that are not perfectly nested to be skipped from
    // the legality check of the `ConversionTarget` since we are not interested
    // in mapping them to OpenMP.
    loopNestOp->walk([&](fir::DoConcurrentOp doLoop) {
      concurrentLoopsToSkip.insert(doLoop);
    });

    rewriter.eraseOp(doLoop);

    return mlir::success();
  }
| |
| private: |
  /// Create an `omp.parallel` op for \p loop, privatizing the loop's
  /// localizers (device path only) and cloning the induction-variable
  /// allocations into the new region. On return, the insertion point is just
  /// before the region's terminator.
  mlir::omp::ParallelOp
  genParallelOp(mlir::ConversionPatternRewriter &rewriter,
                fir::DoConcurrentLoopOp loop,
                looputils::InductionVariableInfos &ivInfos,
                mlir::IRMapping &mapper) const {
    mlir::omp::ParallelOperands parallelOps;

    // On the device path `private` clauses go on `omp.parallel`; on the host
    // path they are attached to the workshare loop instead (see genWsLoopOp).
    if (mapToDevice)
      genPrivatizers(rewriter, mapper, loop, parallelOps);

    mlir::Location loc = loop.getLoc();
    auto parallelOp = mlir::omp::ParallelOp::create(rewriter, loc, parallelOps);
    Fortran::common::openmp::EntryBlockArgs parallelArgs;
    parallelArgs.priv.vars = parallelOps.privateVars;
    Fortran::common::openmp::genEntryBlock(rewriter, parallelArgs,
                                           parallelOp.getRegion());
    rewriter.setInsertionPoint(mlir::omp::TerminatorOp::create(rewriter, loc));

    genLoopNestIndVarAllocs(rewriter, ivInfos, mapper);
    return parallelOp;
  }
| |
| void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter, |
| looputils::InductionVariableInfos &ivInfos, |
| mlir::IRMapping &mapper) const { |
| |
| for (auto &indVarInfo : ivInfos) |
| genInductionVariableAlloc(rewriter, indVarInfo.iterVarMemDef, mapper); |
| } |
| |
| mlir::Operation * |
| genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter, |
| mlir::Operation *indVarMemDef, |
| mlir::IRMapping &mapper) const { |
| assert( |
| indVarMemDef != nullptr && |
| "Induction variable memdef is expected to have a defining operation."); |
| |
| llvm::SmallSetVector<mlir::Operation *, 2> indVarDeclareAndAlloc; |
| for (auto operand : indVarMemDef->getOperands()) |
| indVarDeclareAndAlloc.insert(operand.getDefiningOp()); |
| indVarDeclareAndAlloc.insert(indVarMemDef); |
| |
| mlir::Operation *result; |
| for (mlir::Operation *opToClone : indVarDeclareAndAlloc) |
| result = rewriter.clone(*opToClone, mapper); |
| |
| return result; |
| } |
| |
  /// Populate \p loopNestClauseOps with \p loop's lower/upper bounds and
  /// steps. When \p targetClauseOps is non-null (device mapping, host
  /// compilation), the same values are also recorded as `host_eval` operands
  /// so they are evaluated on the host and forwarded into the `omp.target`
  /// region.
  void genLoopNestClauseOps(
      mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
      fir::DoConcurrentLoopOp loop,
      mlir::omp::LoopNestOperands &loopNestClauseOps,
      mlir::omp::TargetOperands *targetClauseOps = nullptr) const {
    assert(loopNestClauseOps.loopLowerBounds.empty() &&
           "Loop nest bounds were already emitted!");

    // NOTE(review): this assumes every bound value is op-defined (not a block
    // argument) and is result #0 of its defining op — TODO confirm whether
    // pushing `var` directly would be equivalent here.
    auto populateBounds = [](mlir::Value var,
                             llvm::SmallVectorImpl<mlir::Value> &bounds) {
      bounds.push_back(var.getDefiningOp()->getResult(0));
    };

    auto hostEvalCapture = [&](mlir::Value var,
                               llvm::SmallVectorImpl<mlir::Value> &bounds) {
      populateBounds(var, bounds);

      // Ensure that loop-nest bounds are evaluated in the host and forwarded to
      // the nested omp constructs when we map to the device.
      if (targetClauseOps)
        targetClauseOps->hostEvalVars.push_back(var);
    };

    for (auto [lb, ub, st] : llvm::zip_equal(
             loop.getLowerBound(), loop.getUpperBound(), loop.getStep())) {
      hostEvalCapture(lb, loopNestClauseOps.loopLowerBounds);
      hostEvalCapture(ub, loopNestClauseOps.loopUpperBounds);
      hostEvalCapture(st, loopNestClauseOps.loopSteps);
    }

    // `do concurrent` iterates over inclusive ranges.
    loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
  }
| |
  /// Create the `omp.wsloop` + `omp.loop_nest` pair for \p loop and clone the
  /// loop's body into the loop nest using \p mapper.
  ///
  /// On the host path, localizers become `private` clauses on the wsloop (on
  /// the device path privatization already happened on `omp.parallel`, see
  /// genParallelOp). Reductions are always attached to the wsloop.
  ///
  /// \returns the generated (loop nest, wsloop) pair.
  std::pair<mlir::omp::LoopNestOp, mlir::omp::WsloopOp>
  genWsLoopOp(mlir::ConversionPatternRewriter &rewriter,
              fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
              const mlir::omp::LoopNestOperands &clauseOps,
              bool isComposite) const {
    mlir::omp::WsloopOperands wsloopClauseOps;
    if (!mapToDevice)
      genPrivatizers(rewriter, mapper, loop, wsloopClauseOps);

    genReductions(rewriter, mapper, loop, wsloopClauseOps);

    auto wsloopOp =
        mlir::omp::WsloopOp::create(rewriter, loop.getLoc(), wsloopClauseOps);
    wsloopOp.setComposite(isComposite);

    Fortran::common::openmp::EntryBlockArgs wsloopArgs;
    wsloopArgs.priv.vars = wsloopClauseOps.privateVars;
    wsloopArgs.reduction.vars = wsloopClauseOps.reductionVars;
    Fortran::common::openmp::genEntryBlock(rewriter, wsloopArgs,
                                           wsloopOp.getRegion());

    auto loopNestOp =
        mlir::omp::LoopNestOp::create(rewriter, loop.getLoc(), clauseOps);

    // Clone the loop's body inside the loop nest construct using the
    // mapped values.
    rewriter.cloneRegionBefore(loop.getRegion(), loopNestOp.getRegion(),
                               loopNestOp.getRegion().begin(), mapper);

    rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back());
    mlir::omp::YieldOp::create(rewriter, loop->getLoc());

    return {loopNestOp, wsloopOp};
  }
| |
  /// Generate `omp.map.bounds` ops describing \p liveIn's implicit
  /// (whole-variable) bounds, using \p rawAddr as its base address. Results
  /// are returned through \p boundsOps.
  void genBoundsOps(fir::FirOpBuilder &builder, mlir::Value liveIn,
                    mlir::Value rawAddr,
                    llvm::SmallVectorImpl<mlir::Value> &boundsOps) const {
    fir::ExtendedValue extVal =
        hlfir::translateToExtendedValue(rawAddr.getLoc(), builder,
                                        hlfir::Entity{liveIn},
                                        /*contiguousHint=*/
                                        true)
            .first;
    fir::factory::AddrAndBoundsInfo info = fir::factory::getDataOperandBaseAddr(
        builder, rawAddr, /*isOptional=*/false, rawAddr.getLoc());
    boundsOps = fir::factory::genImplicitBoundsOps<mlir::omp::MapBoundsOp,
                                                   mlir::omp::MapBoundsType>(
        builder, info, extVal,
        /*dataExvIsAssumedSize=*/false, rawAddr.getLoc());
  }
| |
  /// Create the `omp.map.info` op that maps \p liveIn into the target region.
  ///
  /// Trivial/char scalars are captured by-copy; all other element types
  /// (except C_PTR) are mapped by-ref with `to|from` flags. Values without a
  /// pointer-like type are first spilled to a stack temporary so there is an
  /// address to map.
  mlir::omp::MapInfoOp genMapInfoOpForLiveIn(fir::FirOpBuilder &builder,
                                             mlir::Value liveIn) const {
    mlir::Value rawAddr = liveIn;
    llvm::StringRef name;

    mlir::Operation *liveInDefiningOp = liveIn.getDefiningOp();
    auto declareOp =
        mlir::dyn_cast_if_present<hlfir::DeclareOp>(liveInDefiningOp);

    if (declareOp != nullptr) {
      // Use the raw address to avoid unboxing `fir.box` values whenever
      // possible. Put differently, if we have access to the direct value memory
      // reference/address, we use it.
      rawAddr = declareOp.getOriginalBase();
      name = declareOp.getUniqName();
    }

    // Not an address at all: spill the value into a temporary so it can be
    // mapped by address like everything else.
    if (!llvm::isa<mlir::omp::PointerLikeType>(rawAddr.getType())) {
      mlir::OpBuilder::InsertionGuard guard(builder);
      builder.setInsertionPointAfter(liveInDefiningOp);
      auto copyVal = builder.createTemporary(liveIn.getLoc(), liveIn.getType());
      builder.createStoreWithConvert(copyVal.getLoc(), liveIn, copyVal);
      rawAddr = copyVal;
    }

    mlir::Type liveInType = liveIn.getType();
    mlir::Type eleType = liveInType;
    if (auto refType = mlir::dyn_cast<fir::ReferenceType>(liveInType))
      eleType = refType.getElementType();

    mlir::omp::ClauseMapFlags mapFlag = mlir::omp::ClauseMapFlags::implicit;
    mlir::omp::VariableCaptureKind captureKind =
        mlir::omp::VariableCaptureKind::ByRef;

    if (fir::isa_trivial(eleType) || fir::isa_char(eleType)) {
      captureKind = mlir::omp::VariableCaptureKind::ByCopy;
    } else if (!fir::isa_builtin_cptr_type(eleType)) {
      mapFlag |= mlir::omp::ClauseMapFlags::to;
      mapFlag |= mlir::omp::ClauseMapFlags::from;
    }

    llvm::SmallVector<mlir::Value> boundsOps;
    genBoundsOps(builder, liveIn, rawAddr, boundsOps);

    return Fortran::utils::openmp::createMapInfoOp(
        builder, liveIn.getLoc(), rawAddr,
        /*varPtrPtr=*/{}, name.str(), boundsOps,
        /*members=*/{},
        /*membersIndex=*/mlir::ArrayAttr{}, mapFlag, captureKind,
        rawAddr.getType());
  }
| |
| mlir::omp::TargetOp |
| genTargetOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, |
| mlir::IRMapping &mapper, llvm::ArrayRef<mlir::Value> mappedVars, |
| mlir::omp::TargetOperands &clauseOps, |
| mlir::omp::LoopNestOperands &loopNestClauseOps, |
| const LiveInShapeInfoMap &liveInShapeInfoMap) const { |
| auto targetOp = mlir::omp::TargetOp::create(rewriter, loc, clauseOps); |
| auto argIface = llvm::cast<mlir::omp::BlockArgOpenMPOpInterface>(*targetOp); |
| |
| mlir::Region ®ion = targetOp.getRegion(); |
| |
| llvm::SmallVector<mlir::Type> regionArgTypes; |
| llvm::SmallVector<mlir::Location> regionArgLocs; |
| |
| for (auto var : llvm::concat<const mlir::Value>(clauseOps.hostEvalVars, |
| clauseOps.mapVars)) { |
| regionArgTypes.push_back(var.getType()); |
| regionArgLocs.push_back(var.getLoc()); |
| } |
| |
| rewriter.createBlock(®ion, {}, regionArgTypes, regionArgLocs); |
| fir::FirOpBuilder builder( |
| rewriter, |
| fir::getKindMapping(targetOp->getParentOfType<mlir::ModuleOp>())); |
| |
| // Within the loop, it is possible that we discover other values that need |
| // to be mapped to the target region (the shape info values for arrays, for |
| // example). Therefore, the map block args might be extended and resized. |
| // Hence, we invoke `argIface.getMapBlockArgs()` every iteration to make |
| // sure we access the proper vector of data. |
| int idx = 0; |
| for (auto [mapInfoOp, mappedVar] : |
| llvm::zip_equal(clauseOps.mapVars, mappedVars)) { |
| auto miOp = mlir::cast<mlir::omp::MapInfoOp>(mapInfoOp.getDefiningOp()); |
| hlfir::DeclareOp liveInDeclare = |
| genLiveInDeclare(builder, targetOp, argIface.getMapBlockArgs()[idx], |
| miOp, liveInShapeInfoMap.at(mappedVar)); |
| ++idx; |
| |
| // If `mappedVar.getDefiningOp()` is a `fir::BoxAddrOp`, we probably |
| // need to "unpack" the box by getting the defining op of it's value. |
| // However, we did not hit this case in reality yet so leaving it as a |
| // todo for now. |
| if (mlir::isa<fir::BoxAddrOp>(mappedVar.getDefiningOp())) |
| TODO(mappedVar.getLoc(), |
| "Mapped variabled defined by `BoxAddrOp` are not supported yet"); |
| |
| auto mapHostValueToDevice = [&](mlir::Value hostValue, |
| mlir::Value deviceValue) { |
| if (!llvm::isa<mlir::omp::PointerLikeType>(hostValue.getType())) |
| mapper.map(hostValue, |
| builder.loadIfRef(hostValue.getLoc(), deviceValue)); |
| else |
| mapper.map(hostValue, deviceValue); |
| }; |
| |
| mapHostValueToDevice(mappedVar, liveInDeclare.getOriginalBase()); |
| |
| if (auto origDeclareOp = mlir::dyn_cast_if_present<hlfir::DeclareOp>( |
| mappedVar.getDefiningOp())) |
| mapHostValueToDevice(origDeclareOp.getBase(), liveInDeclare.getBase()); |
| } |
| |
| for (auto [arg, hostEval] : llvm::zip_equal(argIface.getHostEvalBlockArgs(), |
| clauseOps.hostEvalVars)) |
| mapper.map(hostEval, arg); |
| |
| for (unsigned i = 0; i < loopNestClauseOps.loopLowerBounds.size(); ++i) { |
| loopNestClauseOps.loopLowerBounds[i] = |
| mapper.lookup(loopNestClauseOps.loopLowerBounds[i]); |
| loopNestClauseOps.loopUpperBounds[i] = |
| mapper.lookup(loopNestClauseOps.loopUpperBounds[i]); |
| loopNestClauseOps.loopSteps[i] = |
| mapper.lookup(loopNestClauseOps.loopSteps[i]); |
| } |
| |
| // Check if cloning the bounds introduced any dependency on the outer |
| // region. If so, then either clone them as well if they are |
| // MemoryEffectFree, or else copy them to a new temporary and add them to |
| // the map and block_argument lists and replace their uses with the new |
| // temporary. |
| Fortran::utils::openmp::cloneOrMapRegionOutsiders(builder, targetOp); |
| rewriter.setInsertionPoint( |
| mlir::omp::TerminatorOp::create(rewriter, targetOp.getLoc())); |
| |
| return targetOp; |
| } |
| |
  /// Emit an `hlfir.declare` for the live-in's block argument \p liveInArg
  /// inside the target region, recreating the host-side shape (if any) from
  /// \p targetShapeCreationInfo. Shape extents and start indices are
  /// themselves mapped into the region as temporaries.
  hlfir::DeclareOp genLiveInDeclare(
      fir::FirOpBuilder &builder, mlir::omp::TargetOp targetOp,
      mlir::Value liveInArg, mlir::omp::MapInfoOp liveInMapInfoOp,
      const TargetDeclareShapeCreationInfo &targetShapeCreationInfo) const {
    mlir::Type liveInType = liveInArg.getType();
    std::string liveInName = liveInMapInfoOp.getName().has_value()
                                 ? liveInMapInfoOp.getName().value().str()
                                 : std::string("");
    if (fir::isa_ref_type(liveInType))
      liveInType = fir::unwrapRefType(liveInType);

    // Rebuild the shape op inside the region: `fir.shape_shift` when the
    // host shape had explicit lower bounds, plain `fir.shape` otherwise, or
    // no shape at all for scalars.
    mlir::Value shape = [&]() -> mlir::Value {
      if (!targetShapeCreationInfo.isShapedValue())
        return {};

      if (targetShapeCreationInfo.isShapeShiftedValue()) {
        llvm::SmallVector<mlir::Value> shapeShiftOperands;

        size_t shapeIdx = 0;
        for (auto [startIndex, extent] :
             llvm::zip_equal(targetShapeCreationInfo.startIndices,
                             targetShapeCreationInfo.extents)) {
          shapeShiftOperands.push_back(
              Fortran::utils::openmp::mapTemporaryValue(
                  builder, targetOp, startIndex,
                  liveInName + ".start_idx.dim" + std::to_string(shapeIdx)));
          shapeShiftOperands.push_back(
              Fortran::utils::openmp::mapTemporaryValue(
                  builder, targetOp, extent,
                  liveInName + ".extent.dim" + std::to_string(shapeIdx)));
          ++shapeIdx;
        }

        // Operands interleave as (start, extent) pairs, one pair per dim.
        auto shapeShiftType = fir::ShapeShiftType::get(
            builder.getContext(), shapeShiftOperands.size() / 2);
        return fir::ShapeShiftOp::create(builder, liveInArg.getLoc(),
                                         shapeShiftType, shapeShiftOperands);
      }

      llvm::SmallVector<mlir::Value> shapeOperands;
      size_t shapeIdx = 0;
      for (auto extent : targetShapeCreationInfo.extents) {
        shapeOperands.push_back(Fortran::utils::openmp::mapTemporaryValue(
            builder, targetOp, extent,
            liveInName + ".extent.dim" + std::to_string(shapeIdx)));
        ++shapeIdx;
      }

      return fir::ShapeOp::create(builder, liveInArg.getLoc(), shapeOperands);
    }();

    return hlfir::DeclareOp::create(builder, liveInArg.getLoc(), liveInArg,
                                    liveInName, shape);
  }
| |
  /// Create an `omp.teams` op (carrying the loop's reductions) inside the
  /// target region and map the loop's reduction operands to the teams
  /// region's entry-block arguments. On return, the insertion point is just
  /// before the region's terminator.
  mlir::omp::TeamsOp genTeamsOp(mlir::ConversionPatternRewriter &rewriter,
                                fir::DoConcurrentLoopOp loop,
                                mlir::IRMapping &mapper) const {
    mlir::omp::TeamsOperands teamsOps;
    genReductions(rewriter, mapper, loop, teamsOps);

    mlir::Location loc = loop.getLoc();
    auto teamsOp = mlir::omp::TeamsOp::create(rewriter, loc, teamsOps);
    Fortran::common::openmp::EntryBlockArgs teamsArgs;
    teamsArgs.reduction.vars = teamsOps.reductionVars;
    Fortran::common::openmp::genEntryBlock(rewriter, teamsArgs,
                                           teamsOp.getRegion());

    rewriter.setInsertionPoint(mlir::omp::TerminatorOp::create(rewriter, loc));

    // Nested ops must refer to the teams-region reduction args, not the
    // original loop operands.
    for (auto [loopVar, teamsArg] : llvm::zip_equal(
             loop.getReduceVars(), teamsOp.getRegion().getArguments())) {
      mapper.map(loopVar, teamsArg);
    }

    return teamsOp;
  }
| |
| mlir::omp::DistributeOp |
| genDistributeOp(mlir::Location loc, |
| mlir::ConversionPatternRewriter &rewriter) const { |
| auto distOp = mlir::omp::DistributeOp::create( |
| rewriter, loc, /*clauses=*/mlir::omp::DistributeOperands{}); |
| |
| rewriter.createBlock(&distOp.getRegion()); |
| return distOp; |
| } |
| |
| void cloneFIRRegionToOMP(mlir::ConversionPatternRewriter &rewriter, |
| mlir::Region &firRegion, |
| mlir::Region &ompRegion) const { |
| if (!firRegion.empty()) { |
| rewriter.cloneRegionBefore(firRegion, ompRegion, ompRegion.begin()); |
| auto firYield = |
| mlir::cast<fir::YieldOp>(ompRegion.back().getTerminator()); |
| rewriter.setInsertionPoint(firYield); |
| mlir::omp::YieldOp::create(rewriter, firYield.getLoc(), |
| firYield.getOperands()); |
| rewriter.eraseOp(firYield); |
| } |
| } |
| |
  /// Generate bodies of OpenMP privatizers by cloning the bodies of FIR
  /// privatizers.
  ///
  /// For every `local` specifier on \p loop, an `omp.private` op (named
  /// `<fir_name>.omp`) is created next to the FIR localizer and inserted in
  /// the module's symbol table.
  ///
  /// \param [in] rewriter - used to drive IR generation for privatizers.
  /// \param [in] mapper - value mapping from FIR to OpenMP constructs.
  /// \param [in] loop - FIR loop to convert its localizers.
  ///
  /// \param [out] privateClauseOps - OpenMP privatizers to gen their bodies.
  void genPrivatizers(mlir::ConversionPatternRewriter &rewriter,
                      mlir::IRMapping &mapper, fir::DoConcurrentLoopOp loop,
                      mlir::omp::PrivateClauseOps &privateClauseOps) const {
    // For `local` (and `local_init`) operands, emit corresponding `private`
    // clauses and attach these clauses to the workshare loop.
    if (!loop.getLocalVars().empty())
      for (auto [var, sym, arg] : llvm::zip_equal(
               loop.getLocalVars(),
               loop.getLocalSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
               loop.getRegionLocalArgs())) {
        auto localizer = moduleSymbolTable.lookup<fir::LocalitySpecifierOp>(
            sym.getLeafReference());
        if (localizer.getLocalitySpecifierType() ==
            fir::LocalitySpecifierType::LocalInit)
          TODO(localizer.getLoc(),
               "local_init conversion is not supported yet");

        // Create the OpenMP privatizer next to the FIR localizer it mirrors.
        mlir::OpBuilder::InsertionGuard guard(rewriter);
        rewriter.setInsertionPointAfter(localizer);

        auto privatizer = mlir::omp::PrivateClauseOp::create(
            rewriter, localizer.getLoc(), sym.getLeafReference().str() + ".omp",
            localizer.getTypeAttr().getValue(),
            mlir::omp::DataSharingClauseType::Private);

        cloneFIRRegionToOMP(rewriter, localizer.getInitRegion(),
                            privatizer.getInitRegion());
        cloneFIRRegionToOMP(rewriter, localizer.getDeallocRegion(),
                            privatizer.getDeallocRegion());

        moduleSymbolTable.insert(privatizer);

        // On the device path, the private operand must be the in-region
        // (mapped) value, not the host value.
        privateClauseOps.privateVars.push_back(mapToDevice ? mapper.lookup(var)
                                                           : var);
        privateClauseOps.privateSyms.push_back(
            mlir::SymbolRefAttr::get(privatizer));
      }
  }
| |
  /// Translate the loop's `reduce` specifiers into OpenMP reduction clauses.
  ///
  /// For each FIR reducer referenced by \p loop, an equivalent
  /// `omp.declare_reduction` (named `<fir_name>.omp`) is created next to it —
  /// unless a previous conversion already created one — with all of its
  /// regions cloned from the FIR reducer.
  ///
  /// \param [in] rewriter - used to drive IR generation for reducers.
  /// \param [in] mapper - value mapping from FIR to OpenMP constructs.
  /// \param [in] loop - FIR loop to convert its reductions.
  ///
  /// \param [out] reductionClauseOps - OpenMP reduction clauses to populate.
  void genReductions(mlir::ConversionPatternRewriter &rewriter,
                     mlir::IRMapping &mapper, fir::DoConcurrentLoopOp loop,
                     mlir::omp::ReductionClauseOps &reductionClauseOps) const {
    if (!loop.getReduceVars().empty()) {
      for (auto [var, byRef, sym, arg] : llvm::zip_equal(
               loop.getReduceVars(), loop.getReduceByrefAttr().asArrayRef(),
               loop.getReduceSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
               loop.getRegionReduceArgs())) {
        auto firReducer = moduleSymbolTable.lookup<fir::DeclareReductionOp>(
            sym.getLeafReference());

        mlir::OpBuilder::InsertionGuard guard(rewriter);
        rewriter.setInsertionPointAfter(firReducer);
        std::string ompReducerName = sym.getLeafReference().str() + ".omp";

        // Reuse an OpenMP reducer created by a previous loop conversion, if
        // one exists.
        auto ompReducer =
            moduleSymbolTable.lookup<mlir::omp::DeclareReductionOp>(
                rewriter.getStringAttr(ompReducerName));

        if (!ompReducer) {
          ompReducer = mlir::omp::DeclareReductionOp::create(
              rewriter, firReducer.getLoc(), ompReducerName,
              firReducer.getTypeAttr().getValue(),
              firReducer.getByrefElementTypeAttr());

          cloneFIRRegionToOMP(rewriter, firReducer.getAllocRegion(),
                              ompReducer.getAllocRegion());
          cloneFIRRegionToOMP(rewriter, firReducer.getInitializerRegion(),
                              ompReducer.getInitializerRegion());
          cloneFIRRegionToOMP(rewriter, firReducer.getReductionRegion(),
                              ompReducer.getReductionRegion());
          cloneFIRRegionToOMP(rewriter, firReducer.getAtomicReductionRegion(),
                              ompReducer.getAtomicReductionRegion());
          cloneFIRRegionToOMP(rewriter, firReducer.getCleanupRegion(),
                              ompReducer.getCleanupRegion());
          moduleSymbolTable.insert(ompReducer);
        }

        // On the device path, the reduction operand must be the in-region
        // (mapped) value, not the host value.
        reductionClauseOps.reductionVars.push_back(
            mapToDevice ? mapper.lookup(var) : var);
        reductionClauseOps.reductionByref.push_back(byRef);
        reductionClauseOps.reductionSyms.push_back(
            mlir::SymbolRefAttr::get(ompReducer));
      }
    }
  }
| |
| bool mapToDevice; |
| llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip; |
| mlir::SymbolTable &moduleSymbolTable; |
| }; |
| |
| /// A listener that forwards notifyOperationErased to the given callback. |
| struct CallbackListener : public mlir::RewriterBase::Listener { |
| CallbackListener(std::function<void(mlir::Operation *op)> onOperationErased) |
| : onOperationErased(onOperationErased) {} |
| |
| void notifyOperationErased(mlir::Operation *op) override { |
| onOperationErased(op); |
| } |
| |
| std::function<void(mlir::Operation *op)> onOperationErased; |
| }; |
| |
/// Pass wrapper around the `DoConcurrentConversion` pattern: converts all
/// eligible `fir.do_concurrent` ops in the module to OpenMP, targeting either
/// the host or an offload device depending on the `map-to` pass option.
class DoConcurrentConversionPass
    : public flangomp::impl::DoConcurrentConversionPassBase<
          DoConcurrentConversionPass> {
public:
  DoConcurrentConversionPass() = default;

  DoConcurrentConversionPass(
      const flangomp::DoConcurrentConversionPassOptions &options)
      : DoConcurrentConversionPassBase(options) {}

  void runOnOperation() override {
    mlir::ModuleOp module = getOperation();
    mlir::MLIRContext *context = &getContext();
    mlir::SymbolTable moduleSymbolTable(module);

    // Reject any `map-to` value other than host/device up front; the pass is
    // a no-op (with a warning) in that case.
    if (mapTo != flangomp::DoConcurrentMappingKind::DCMK_Host &&
        mapTo != flangomp::DoConcurrentMappingKind::DCMK_Device) {
      mlir::emitWarning(mlir::UnknownLoc::get(context),
                        "DoConcurrentConversionPass: invalid `map-to` value. "
                        "Valid values are: `host` or `device`");
      return;
    }

    // Loops recorded here are treated as legal and left unconverted. When
    // such a loop is erased during conversion, the listener drops it from the
    // set so the set never refers to a dead op.
    llvm::DenseSet<fir::DoConcurrentOp> concurrentLoopsToSkip;
    CallbackListener callbackListener([&](mlir::Operation *op) {
      if (auto loop = mlir::dyn_cast<fir::DoConcurrentOp>(op))
        concurrentLoopsToSkip.erase(loop);
    });
    mlir::RewritePatternSet patterns(context);
    patterns.insert<DoConcurrentConversion>(
        context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device,
        concurrentLoopsToSkip, moduleSymbolTable);
    mlir::ConversionTarget target(*context);
    // A `do concurrent` op is legal only when explicitly marked as
    // to-be-skipped (e.g. an imperfectly nested inner loop).
    target.addDynamicallyLegalOp<fir::DoConcurrentOp>(
        [&](fir::DoConcurrentOp op) {
          return concurrentLoopsToSkip.contains(op);
        });
    target.markUnknownOpDynamicallyLegal(
        [](mlir::Operation *) { return true; });

    mlir::ConversionConfig config;
    config.allowPatternRollback = false;
    config.listener = &callbackListener;
    if (mlir::failed(mlir::applyFullConversion(module, target,
                                               std::move(patterns), config))) {
      signalPassFailure();
    }
  }
};
| } // namespace |
| |
| std::unique_ptr<mlir::Pass> |
| flangomp::createDoConcurrentConversionPass(bool mapToDevice) { |
| DoConcurrentConversionPassOptions options; |
| options.mapTo = mapToDevice ? flangomp::DoConcurrentMappingKind::DCMK_Device |
| : flangomp::DoConcurrentMappingKind::DCMK_Host; |
| |
| return std::make_unique<DoConcurrentConversionPass>(options); |
| } |