flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp - llvm-project - Git at Google

 //===- DoConcurrentConversion.cpp -- map `DO CONCURRENT` to OpenMP loops --===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/OpenMP/Passes.h"
 #include "flang/Optimizer/OpenMP/Utils.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/RegionUtils.h"

 namespace flangomp {
 #define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS
 #include "flang/Optimizer/OpenMP/Passes.h.inc"
 } // namespace flangomp

 #define DEBUG_TYPE "do-concurrent-conversion"
 #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ")

 namespace {
 namespace looputils {
 /// Stores info needed about the induction/iteration variable for each `do
 /// concurrent` in a loop nest.
 struct InductionVariableInfo {
   InductionVariableInfo(fir::DoLoopOp doLoop) { populateInfo(doLoop); }

   /// The operation allocating memory for iteration variable.
   mlir::Operation *iterVarMemDef;
   /// the operation(s) updating the iteration variable with the current
   /// iteration number.
   llvm::SmallVector<mlir::Operation *, 2> indVarUpdateOps;

 private:
   /// For the \p doLoop parameter, find the following:
   ///
   /// 1. The operation that declares its iteration variable or allocates memory
   /// for it. For example, give the following loop:
   /// ```
   ///   ...
   ///   %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ...
   ///   ...
   ///   fir.do_loop %ind_var = %lb to %ub step %s unordered {
   ///     %ind_var_conv = fir.convert %ind_var : (index) -> i32
   ///     fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
   ///     ...
   ///   }
   /// ```
   ///
   /// This function sets the `iterVarMemDef` member to the `hlfir.declare` op
   /// for `%i`.
   ///
   /// 2. The operation(s) that update the loop's iteration variable from its
   /// induction variable. For the above example, the `indVarUpdateOps` is
   /// populated with the first 2 ops in the loop's body.
   ///
   /// Note: The current implementation is dependent on how flang emits loop
   /// bodies; which is sufficient for the current simple test/use cases. If this
   /// proves to be insufficient, this should be made more generic.
   void populateInfo(fir::DoLoopOp doLoop) {
     mlir::Value result = nullptr;

     // Checks if a StoreOp is updating the memref of the loop's iteration
     // variable.
     auto isStoringIV = [&](fir::StoreOp storeOp) {
       // Direct store into the IV memref.
       if (storeOp.getValue() == doLoop.getInductionVar()) {
         indVarUpdateOps.push_back(storeOp);
         return true;
       }

       // Indirect store into the IV memref.
       if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>(
               storeOp.getValue().getDefiningOp())) {
         if (convertOp.getOperand() == doLoop.getInductionVar()) {
           indVarUpdateOps.push_back(convertOp);
           indVarUpdateOps.push_back(storeOp);
           return true;
         }
       }

       return false;
     };

     for (mlir::Operation &op : doLoop) {
       if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op))
         if (isStoringIV(storeOp)) {
           result = storeOp.getMemref();
           break;
         }
     }

     assert(result != nullptr && result.getDefiningOp() != nullptr);
     iterVarMemDef = result.getDefiningOp();
   }
 };

 /// Maps each `fir.do_loop` of a loop nest to the info collected about its
 /// iteration variable. Users in this file rely on the entries being visited in
 /// insertion order: `collectLoopNest` inserts loops from the outermost to the
 /// innermost one, and `back()` is used to access the innermost loop.
 using LoopNestToIndVarMap =
     llvm::MapVector<fir::DoLoopOp, InductionVariableInfo>;

 /// Determines whether \p innerLoop is perfectly nested inside \p outerLoop.
 ///
 /// The nesting is considered perfect iff the body of \p outerLoop holds
 /// nothing other than:
 ///
 /// 1. the operations needed to assign/update \p outerLoop's induction variable,
 /// 2. \p innerLoop itself (its contents are not inspected), and
 /// 3. `fir.result` ops.
 ///
 /// \return true if \p innerLoop is perfectly nested inside \p outerLoop
 /// according to the above definition.
 bool isPerfectlyNested(fir::DoLoopOp outerLoop, fir::DoLoopOp innerLoop) {
   // Running example referenced by the comments below:
   // ```
   // 1. fir.do_loop %i_idx = %34 to %36 step %c1 unordered {
   // 2.   %i_idx_2 = fir.convert %i_idx : (index) -> i32
   // 3.   fir.store %i_idx_2 to %i_iv#1 : !fir.ref<i32>
   //
   // 4.   fir.do_loop %j_idx = %37 to %39 step %c1_3 unordered {
   // 5.     %j_idx_2 = fir.convert %j_idx : (index) -> i32
   // 6.     fir.store %j_idx_2 to %j_iv#1 : !fir.ref<i32>
   //        ... loop nest body, possible uses %i_idx ...
   //      }
   //    }
   // ```
   // Here, the `j` loop is perfectly nested inside the `i` loop.

   mlir::ForwardSliceOptions sliceOptions;
   sliceOptions.inclusive = true;
   // Uses of the outer IV from within the inner loop are irrelevant for this
   // check. The filter only accepts ops whose results are all defined above
   // (i.e. outside) the inner loop's region. Since the outer IV is a block
   // argument of the outer loop's body, the slice ends up with the uses of the
   // outer IV that are inside the outer loop but outside the inner loop.
   sliceOptions.filter = [&](mlir::Operation *candidate) {
     return mlir::areValuesDefinedAbove(candidate->getResults(),
                                        innerLoop.getRegion());
   };

   // For the running example, the forward slice of `%i_idx` consists of the ops
   // in lines 2 & 3; the uses inside the `j` loop are rejected by the filter.
   llvm::SetVector<mlir::Operation *> ivForwardSlice;
   mlir::getForwardSlice(outerLoop.getInductionVar(), &ivForwardSlice,
                         sliceOptions);
   llvm::DenseSet<mlir::Operation *> ivUpdateOps(ivForwardSlice.begin(),
                                                 ivForwardSlice.end());

   // Gather every op nested in the outer loop, ignoring the outer loop itself,
   // the inner loop together with everything it contains, and `fir.result`
   // ops. For the running example, this yields the ops in lines 2 & 3 as well.
   llvm::DenseSet<mlir::Operation *> remainingBodyOps;
   outerLoop.walk<mlir::WalkOrder::PreOrder>([&](mlir::Operation *visited) {
     // Do not descend into the inner loop.
     if (visited == innerLoop)
       return mlir::WalkResult::skip();

     const bool isIgnored =
         visited == outerLoop || mlir::isa<fir::ResultOp>(visited);
     if (!isIgnored)
       remainingBodyOps.insert(visited);

     return mlir::WalkResult::advance();
   });

   // Both sets being equal means that, apart from the inner loop and the
   // terminator, the outer loop only contains the ops that set up its
   // induction variable; i.e. the inner loop is perfectly nested.
   const bool isPerfect = (remainingBodyOps == ivUpdateOps);
   LLVM_DEBUG(DBGS() << "Loop pair starting at location " << outerLoop.getLoc()
                     << " is" << (isPerfect ? "" : " not")
                     << " perfectly nested\n");

   return isPerfect;
 }

 /// Starting with \p currentLoop, collects a perfectly nested loop nest, if
 /// any, into \p loopNest (outermost loop first).
 ///
 /// As many loops as possible are collected: in case a certain nested loop is
 /// not recognized as part of the nest, the parent loops discovered so far are
 /// kept in \p loopNest.
 ///
 /// \return failure if collection stopped at a loop that contains more than one
 /// `unordered` loop, or a single `unordered` loop that is not perfectly
 /// nested; success if the innermost collected loop directly contains no
 /// `unordered` loops.
 mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
                                     LoopNestToIndVarMap &loopNest) {
   assert(currentLoop.getUnordered());

   while (true) {
     loopNest.insert({currentLoop, InductionVariableInfo(currentLoop)});

     // Gather the `unordered` loops that are direct children of the current
     // loop's region.
     llvm::SmallVector<fir::DoLoopOp> nestedCandidates;
     for (fir::DoLoopOp candidate :
          currentLoop.getRegion().getOps<fir::DoLoopOp>()) {
       if (candidate.getUnordered())
         nestedCandidates.push_back(candidate);
     }

     // Reached the innermost loop of the nest.
     if (nestedCandidates.empty())
       break;

     // More than one unordered loop means that this is not a perfect loop nest
     // (i.e. a multi-range `do concurrent` loop), which is the only case
     // handled here. The same goes for a single loop that is not perfectly
     // nested.
     const bool hasSingleCandidate = nestedCandidates.size() == 1;
     if (!hasSingleCandidate ||
         !isPerfectlyNested(currentLoop, nestedCandidates.front()))
       return mlir::failure();

     currentLoop = nestedCandidates.front();
   }

   return mlir::success();
 }

 /// Reshapes a `fir.do_loop` nest so that it can be easily mapped to OpenMP.
 /// For example, this input IR:
 /// ```
 /// fir.do_loop %i_iv = %i_lb to %i_ub step %i_step unordered {
 ///   fir.store %i_iv to %i#1 : !fir.ref<i32>
 ///   %j_lb = arith.constant 1 : i32
 ///   %j_ub = arith.constant 10 : i32
 ///   %j_step = arith.constant 1 : index
 ///
 ///   fir.do_loop %j_iv = %j_lb to %j_ub step %j_step unordered {
 ///     fir.store %j_iv to %j#1 : !fir.ref<i32>
 ///     ...
 ///   }
 /// }
 /// ```
 ///
 /// is turned into the following form (shown in generic op form since the
 /// result is technically an invalid `fir.do_loop` op):
 ///
 /// ```
 /// "fir.do_loop"(%i_lb, %i_ub, %i_step) <{unordered}> ({
 /// ^bb0(%i_iv: index):
 ///   %j_lb = "arith.constant"() <{value = 1 : i32}> : () -> i32
 ///   %j_ub = "arith.constant"() <{value = 10 : i32}> : () -> i32
 ///   %j_step = "arith.constant"() <{value = 1 : index}> : () -> index
 ///
 ///   "fir.do_loop"(%j_lb, %j_ub, %j_step) <{unordered}> ({
 ///   ^bb0(%new_i_iv: index, %new_j_iv: index):
 ///     "fir.store"(%new_i_iv, %i#1) : (i32, !fir.ref<i32>) -> ()
 ///     "fir.store"(%new_j_iv, %j#1) : (i32, !fir.ref<i32>) -> ()
 ///     ...
 ///   })
 /// ```
 ///
 /// In other words:
 ///
 /// * the innermost loop's entry block goes from having one argument to having
 ///   `n` arguments, where `n` is the number of loops in the nest,
 ///
 /// * the ops updating the IVs of the outer loop(s) are sunk into the innermost
 ///   loop (see the `"fir.store"(%new_i_iv, %i#1)` op above),
 ///
 /// * the innermost loop's entry block arguments are mapped in order from the
 ///   outermost to the innermost IV.
 ///
 /// After this change, the innermost loop's region can be directly inlined
 /// into the newly generated `omp.loop_nest` op.
 ///
 /// Nothing is done if \p loopNest has at most one loop.
 ///
 /// Note that this function has a pre-condition that \p loopNest consists of
 /// perfectly nested loops; i.e. there are no in-between ops between 2 nested
 /// loops except for the ops to setup the inner loop's LB, UB, and step. These
 /// ops are handled by `genLoopNestClauseOps(..)`.
 void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter,
                     looputils::LoopNestToIndVarMap &loopNest) {
   if (loopNest.size() <= 1)
     return;

   fir::DoLoopOp innermostLoop = loopNest.back().first;
   mlir::Region &innermostRegion = innermostLoop.getRegion();
   // All sunk ops are placed before the op that is currently first in the
   // innermost loop's body.
   mlir::Operation *sinkPoint = &innermostRegion.front().front();

   llvm::SmallVector<mlir::Type> outerIVTypes;
   llvm::SmallVector<mlir::Location> outerIVLocs;

   // The innermost loop already has its IV update ops and its IV argument in
   // place, hence `drop_end`.
   for (auto &[outerLoop, outerIVInfo] : llvm::drop_end(loopNest)) {
     for (mlir::Operation *updateOp : outerIVInfo.indVarUpdateOps)
       updateOp->moveBefore(sinkPoint);

     mlir::Value outerIV = outerLoop.getInductionVar();
     outerIVTypes.push_back(outerIV.getType());
     outerIVLocs.push_back(outerIV.getLoc());
   }

   // Extend the innermost region with one argument per outer IV.
   innermostRegion.addArguments(outerIVTypes, outerIVLocs);

   // Remap the IVs from the innermost to the outermost loop, assigning region
   // arguments from the last to the first. The reverse order matters: the
   // first argument is the old IV of the innermost loop, so its uses must be
   // moved to the last argument before the first argument takes over the role
   // of the outermost loop's IV.
   unsigned nextArgIdx = innermostRegion.getNumArguments();
   for (auto &[doLoop, _] : llvm::reverse(loopNest)) {
     --nextArgIdx;
     doLoop.getInductionVar().replaceAllUsesWith(
         innermostRegion.getArgument(nextArgIdx));
   }
 }

 /// Collects "loop-local values" of a loop: values that are allocated (through
 /// `fir.alloca`) outside of the loop but used exclusively inside of it. This
 /// usually corresponds to temporary values that are used inside the loop body
 /// for initializing other variables for example.
 ///
 /// See `flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90` for an
 /// example of why we need this.
 ///
 /// \param [in] doLoop - the loop within which the function searches for values
 /// used exclusively inside.
 ///
 /// \param [out] locals - the list of loop-local values detected for \p doLoop.
 void collectLoopLocalValues(fir::DoLoopOp doLoop,
                             llvm::SetVector<mlir::Value> &locals) {
   auto isInsideLoop = [&](mlir::Operation *op) {
     return doLoop->isAncestor(op);
   };

   doLoop.walk([&](mlir::Operation *op) {
     for (mlir::Value operand : op->getOperands()) {
       // Already classified as loop-local.
       if (locals.contains(operand))
         continue;

       // Only values produced by a `fir.alloca` are candidates.
       mlir::Operation *allocOp = operand.getDefiningOp();
       if (!mlir::isa_and_present<fir::AllocaOp>(allocOp))
         continue;

       // Allocations made inside the loop are not interesting since they do
       // not need to be localized.
       if (isInsideLoop(allocOp))
         continue;

       // The value is loop-local only if none of its users is outside the loop.
       if (llvm::all_of(operand.getUsers(), isInsideLoop))
         locals.insert(operand);
     }
   });
 }

 /// Localizes a "loop-local" value \p local within the scope of the parallel
 /// region its loop maps to. This is done by moving the op defining \p local to
 /// the start of \p allocRegion's first block.
 ///
 /// \param local - the value used exclusively within a loop's scope (see
 /// collectLoopLocalValues).
 ///
 /// \param allocRegion - the parallel region where \p local's allocation will be
 /// privatized.
 ///
 /// \param rewriter - builder used for updating \p allocRegion.
 static void localizeLoopLocalValue(mlir::Value local, mlir::Region &allocRegion,
                                    mlir::ConversionPatternRewriter &rewriter) {
   mlir::Operation *allocOp = local.getDefiningOp();
   mlir::Operation *firstOpInRegion = &allocRegion.front().front();
   rewriter.moveOpBefore(allocOp, firstOpInRegion);
 }
 } // namespace looputils

 /// Conversion pattern that rewrites a `fir.do_loop` nest into an
 /// `omp.parallel` op containing an `omp.wsloop` that wraps an
 /// `omp.loop_nest`. Only mapping to the host is implemented.
 class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
 public:
   using mlir::OpConversionPattern<fir::DoLoopOp>::OpConversionPattern;

   /// \param context - the MLIR context the pattern is created in.
   ///
   /// \param mapToDevice - whether loops should be mapped to the device rather
   /// than the host (not yet implemented; matching emits an error).
   ///
   /// \param concurrentLoopsToSkip - set shared with the pass; it is populated
   /// with the `unordered` loops that remain inside converted loop nests so
   /// that they can be treated as legal by the conversion target.
   DoConcurrentConversion(mlir::MLIRContext *context, bool mapToDevice,
                          llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip)
       : OpConversionPattern(context), mapToDevice(mapToDevice),
         concurrentLoopsToSkip(concurrentLoopsToSkip) {}

   /// Converts the loop nest rooted at \p doLoop to OpenMP and erases
   /// \p doLoop. Emits a warning if some nested `unordered` loops could not be
   /// made part of the nest; those loops are left as they are inside the
   /// generated `omp.loop_nest`.
   mlir::LogicalResult
   matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor,
                   mlir::ConversionPatternRewriter &rewriter) const override {
     if (mapToDevice)
       return doLoop.emitError(
           "not yet implemented: Mapping `do concurrent` loops to device");

     looputils::LoopNestToIndVarMap loopNest;
     bool hasRemainingNestedLoops =
         failed(looputils::collectLoopNest(doLoop, loopNest));
     if (hasRemainingNestedLoops)
       mlir::emitWarning(doLoop.getLoc(),
                         "Some `do concurent` loops are not perfectly-nested. "
                         "These will be serialized.");

     // Loop-local values have to be collected before the nest is restructured.
     llvm::SetVector<mlir::Value> locals;
     looputils::collectLoopLocalValues(loopNest.back().first, locals);
     looputils::sinkLoopIVArgs(rewriter, loopNest);

     mlir::IRMapping mapper;
     mlir::omp::ParallelOp parallelOp =
         genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper);
     mlir::omp::LoopNestOperands loopNestClauseOps;
     genLoopNestClauseOps(doLoop.getLoc(), rewriter, loopNest, mapper,
                          loopNestClauseOps);

     for (mlir::Value local : locals)
       looputils::localizeLoopLocalValue(local, parallelOp.getRegion(),
                                         rewriter);

     mlir::omp::LoopNestOp ompLoopNest =
         genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps,
                     /*isComposite=*/mapToDevice);

     rewriter.eraseOp(doLoop);

     // Mark `unordered` loops that are not perfectly nested to be skipped from
     // the legality check of the `ConversionTarget` since we are not interested
     // in mapping them to OpenMP.
     ompLoopNest->walk([&](fir::DoLoopOp nestedLoop) {
       if (nestedLoop.getUnordered())
         concurrentLoopsToSkip.insert(nestedLoop);
     });

     return mlir::success();
   }

 private:
   /// Creates an `omp.parallel` op with a single block ending with an
   /// `omp.terminator`, and clones the ops defining the iteration variables of
   /// \p loopNest into it. On return, the insertion point of \p rewriter is
   /// right before the terminator.
   mlir::omp::ParallelOp genParallelOp(mlir::Location loc,
                                       mlir::ConversionPatternRewriter &rewriter,
                                       looputils::LoopNestToIndVarMap &loopNest,
                                       mlir::IRMapping &mapper) const {
     auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loc);
     rewriter.createBlock(&parallelOp.getRegion());
     rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));

     genLoopNestIndVarAllocs(rewriter, loopNest, mapper);
     return parallelOp;
   }

   /// Clones, at the current insertion point, the ops defining the iteration
   /// variable of every loop in \p loopNest.
   void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter,
                                looputils::LoopNestToIndVarMap &loopNest,
                                mlir::IRMapping &mapper) const {

     for (auto &[_, indVarInfo] : loopNest)
       genInductionVariableAlloc(rewriter, indVarInfo.iterVarMemDef, mapper);
   }

   /// Clones \p indVarMemDef, preceded by the ops defining its operands, at the
   /// current insertion point and records the clones in \p mapper.
   ///
   /// Operands without a defining op (i.e. block arguments) have nothing to
   /// clone; they are skipped and keep referring to the original value.
   ///
   /// \return the clone of \p indVarMemDef.
   mlir::Operation *
   genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter,
                             mlir::Operation *indVarMemDef,
                             mlir::IRMapping &mapper) const {
     assert(
         indVarMemDef != nullptr &&
         "Induction variable memdef is expected to have a defining operation.");

     llvm::SmallSetVector<mlir::Operation *, 2> indVarDeclareAndAlloc;
     for (mlir::Value operand : indVarMemDef->getOperands())
       if (mlir::Operation *operandDef = operand.getDefiningOp())
         indVarDeclareAndAlloc.insert(operandDef);
     // Inserted last so that its operands are cloned (and mapped) first.
     indVarDeclareAndAlloc.insert(indVarMemDef);

     mlir::Operation *result = nullptr;
     for (mlir::Operation *opToClone : indVarDeclareAndAlloc)
       result = rewriter.clone(*opToClone, mapper);

     return result;
   }

   /// Populates the lower bounds, upper bounds, and steps of
   /// \p loopNestClauseOps from the loops of \p loopNest (outermost first) and
   /// marks the loop bounds as inclusive.
   ///
   /// The bound values are used as they are; this also covers bounds that are
   /// block arguments or that are not the first result of their defining op.
   void genLoopNestClauseOps(
       mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
       looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper,
       mlir::omp::LoopNestOperands &loopNestClauseOps) const {
     assert(loopNestClauseOps.loopLowerBounds.empty() &&
            "Loop nest bounds were already emitted!");

     for (auto &[doLoop, _] : loopNest) {
       loopNestClauseOps.loopLowerBounds.push_back(doLoop.getLowerBound());
       loopNestClauseOps.loopUpperBounds.push_back(doLoop.getUpperBound());
       loopNestClauseOps.loopSteps.push_back(doLoop.getStep());
     }

     loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
   }

   /// Creates an `omp.wsloop` wrapping an `omp.loop_nest` built from
   /// \p clauseOps, and clones the region of \p doLoop into the loop nest. The
   /// terminator of the cloned region is replaced with an `omp.yield`.
   ///
   /// \return the generated `omp.loop_nest` op.
   mlir::omp::LoopNestOp
   genWsLoopOp(mlir::ConversionPatternRewriter &rewriter, fir::DoLoopOp doLoop,
               mlir::IRMapping &mapper,
               const mlir::omp::LoopNestOperands &clauseOps,
               bool isComposite) const {

     auto wsloopOp = rewriter.create<mlir::omp::WsloopOp>(doLoop.getLoc());
     wsloopOp.setComposite(isComposite);
     rewriter.createBlock(&wsloopOp.getRegion());

     auto loopNestOp =
         rewriter.create<mlir::omp::LoopNestOp>(doLoop.getLoc(), clauseOps);

     // Clone the loop's body inside the loop nest construct using the
     // mapped values.
     rewriter.cloneRegionBefore(doLoop.getRegion(), loopNestOp.getRegion(),
                                loopNestOp.getRegion().begin(), mapper);

     mlir::Operation *terminator = loopNestOp.getRegion().back().getTerminator();
     rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back());
     rewriter.create<mlir::omp::YieldOp>(terminator->getLoc());
     rewriter.eraseOp(terminator);

     return loopNestOp;
   }

   /// Whether loops are mapped to the device (true) or to the host (false).
   bool mapToDevice;
   /// `unordered` loops left inside converted nests; owned by the pass.
   llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip;
 };

 /// Function pass that applies `DoConcurrentConversion` to the `unordered`
 /// `fir.do_loop` ops of a function.
 class DoConcurrentConversionPass
     : public flangomp::impl::DoConcurrentConversionPassBase<
           DoConcurrentConversionPass> {
 public:
   DoConcurrentConversionPass() = default;

   DoConcurrentConversionPass(
       const flangomp::DoConcurrentConversionPassOptions &options)
       : DoConcurrentConversionPassBase(options) {}

   /// Runs the conversion on the current function. Declarations are skipped; an
   /// invalid `map-to` value only results in a warning and leaves the function
   /// untouched.
   void runOnOperation() override {
     mlir::func::FuncOp func = getOperation();
     if (func.isDeclaration())
       return;

     mlir::MLIRContext *ctx = &getContext();

     const bool mapsToHost =
         mapTo == flangomp::DoConcurrentMappingKind::DCMK_Host;
     const bool mapsToDevice =
         mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device;
     if (!mapsToHost && !mapsToDevice) {
       mlir::emitWarning(mlir::UnknownLoc::get(ctx),
                         "DoConcurrentConversionPass: invalid `map-to` value. "
                         "Valid values are: `host` or `device`");
       return;
     }

     // Filled in by the pattern with the `unordered` loops that should be left
     // as they are.
     llvm::DenseSet<fir::DoLoopOp> concurrentLoopsToSkip;

     mlir::RewritePatternSet patterns(ctx);
     patterns.insert<DoConcurrentConversion>(ctx, mapsToDevice,
                                             concurrentLoopsToSkip);

     mlir::ConversionTarget target(*ctx);
     // The goal is to handle constructs that eventually get lowered to
     // `fir.do_loop` with the `unordered` attribute (e.g. array expressions).
     // Currently, this is only enabled for the `do concurrent` construct since
     // the pass runs early in the pipeline.
     target.addDynamicallyLegalOp<fir::DoLoopOp>([&](fir::DoLoopOp op) {
       if (!op.getUnordered())
         return true;
       return concurrentLoopsToSkip.contains(op);
     });
     // Every other op is legal.
     target.markUnknownOpDynamicallyLegal(
         [](mlir::Operation *) { return true; });

     if (mlir::failed(
             mlir::applyFullConversion(func, target, std::move(patterns))))
       signalPassFailure();
   }
 };
 } // namespace

 /// Creates a `DoConcurrentConversionPass` that maps loops to the device if
 /// \p mapToDevice is true, and to the host otherwise.
 std::unique_ptr<mlir::Pass>
 flangomp::createDoConcurrentConversionPass(bool mapToDevice) {
   DoConcurrentConversionPassOptions options;
   if (mapToDevice)
     options.mapTo = flangomp::DoConcurrentMappingKind::DCMK_Device;
   else
     options.mapTo = flangomp::DoConcurrentMappingKind::DCMK_Host;

   return std::make_unique<DoConcurrentConversionPass>(options);
 }
	//===- DoConcurrentConversion.cpp -- map `DO CONCURRENT` to OpenMP loops --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "flang/Optimizer/Dialect/FIROps.h"
	#include "flang/Optimizer/OpenMP/Passes.h"
	#include "flang/Optimizer/OpenMP/Utils.h"
	#include "mlir/Analysis/SliceAnalysis.h"
	#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
	#include "mlir/IR/IRMapping.h"
	#include "mlir/Transforms/DialectConversion.h"
	#include "mlir/Transforms/RegionUtils.h"

	namespace flangomp {
	#define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS
	#include "flang/Optimizer/OpenMP/Passes.h.inc"
	} // namespace flangomp

	#define DEBUG_TYPE "do-concurrent-conversion"
	#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ")

	namespace {
	namespace looputils {
	/// Stores info needed about the induction/iteration variable for each `do
	/// concurrent` in a loop nest.
	struct InductionVariableInfo {
	InductionVariableInfo(fir::DoLoopOp doLoop) { populateInfo(doLoop); }

	/// The operation allocating memory for iteration variable.
	mlir::Operation *iterVarMemDef;
	/// the operation(s) updating the iteration variable with the current
	/// iteration number.
	llvm::SmallVector<mlir::Operation *, 2> indVarUpdateOps;

	private:
	/// For the \p doLoop parameter, find the following:
	///
	/// 1. The operation that declares its iteration variable or allocates memory
	/// for it. For example, give the following loop:
	/// ```
	/// ...
	/// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ...
	/// ...
	/// fir.do_loop %ind_var = %lb to %ub step %s unordered {
	/// %ind_var_conv = fir.convert %ind_var : (index) -> i32
	/// fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
	/// ...
	/// }
	/// ```
	///
	/// This function sets the `iterVarMemDef` member to the `hlfir.declare` op
	/// for `%i`.
	///
	/// 2. The operation(s) that update the loop's iteration variable from its
	/// induction variable. For the above example, the `indVarUpdateOps` is
	/// populated with the first 2 ops in the loop's body.
	///
	/// Note: The current implementation is dependent on how flang emits loop
	/// bodies; which is sufficient for the current simple test/use cases. If this
	/// proves to be insufficient, this should be made more generic.
	void populateInfo(fir::DoLoopOp doLoop) {
	mlir::Value result = nullptr;

	// Checks if a StoreOp is updating the memref of the loop's iteration
	// variable.
	auto isStoringIV = [&](fir::StoreOp storeOp) {
	// Direct store into the IV memref.
	if (storeOp.getValue() == doLoop.getInductionVar()) {
	indVarUpdateOps.push_back(storeOp);
	return true;
	}

	// Indirect store into the IV memref.
	if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>(
	storeOp.getValue().getDefiningOp())) {
	if (convertOp.getOperand() == doLoop.getInductionVar()) {
	indVarUpdateOps.push_back(convertOp);
	indVarUpdateOps.push_back(storeOp);
	return true;
	}
	}

	return false;
	};

	for (mlir::Operation &op : doLoop) {
	if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op))
	if (isStoringIV(storeOp)) {
	result = storeOp.getMemref();
	break;
	}
	}

	assert(result != nullptr && result.getDefiningOp() != nullptr);
	iterVarMemDef = result.getDefiningOp();
	}
	};

	/// Maps each `fir.do_loop` of a loop nest to the info collected about its
	/// iteration variable. `collectLoopNest` inserts loops from the outermost to
	/// the innermost one.
	using LoopNestToIndVarMap =
	llvm::MapVector<fir::DoLoopOp, InductionVariableInfo>;

	/// Determines whether \p innerLoop is perfectly nested inside \p outerLoop.
	///
	/// The nesting is considered perfect iff the body of \p outerLoop holds
	/// nothing other than:
	///
	/// 1. the operations needed to assign/update \p outerLoop's induction variable,
	/// 2. \p innerLoop itself (its contents are not inspected), and
	/// 3. `fir.result` ops.
	///
	/// \return true if \p innerLoop is perfectly nested inside \p outerLoop
	/// according to the above definition.
	bool isPerfectlyNested(fir::DoLoopOp outerLoop, fir::DoLoopOp innerLoop) {
	  // Running example referenced by the comments below:
	  // ```
	  // 1. fir.do_loop %i_idx = %34 to %36 step %c1 unordered {
	  // 2.   %i_idx_2 = fir.convert %i_idx : (index) -> i32
	  // 3.   fir.store %i_idx_2 to %i_iv#1 : !fir.ref<i32>
	  //
	  // 4.   fir.do_loop %j_idx = %37 to %39 step %c1_3 unordered {
	  // 5.     %j_idx_2 = fir.convert %j_idx : (index) -> i32
	  // 6.     fir.store %j_idx_2 to %j_iv#1 : !fir.ref<i32>
	  //        ... loop nest body, possible uses %i_idx ...
	  //      }
	  //    }
	  // ```
	  // Here, the `j` loop is perfectly nested inside the `i` loop.

	  mlir::ForwardSliceOptions sliceOptions;
	  sliceOptions.inclusive = true;
	  // Uses of the outer IV from within the inner loop are irrelevant for this
	  // check. The filter only accepts ops whose results are all defined above
	  // (i.e. outside) the inner loop's region. Since the outer IV is a block
	  // argument of the outer loop's body, the slice ends up with the uses of the
	  // outer IV that are inside the outer loop but outside the inner loop.
	  sliceOptions.filter = [&](mlir::Operation *candidate) {
	    return mlir::areValuesDefinedAbove(candidate->getResults(),
	                                       innerLoop.getRegion());
	  };

	  // For the running example, the forward slice of `%i_idx` consists of the ops
	  // in lines 2 & 3; the uses inside the `j` loop are rejected by the filter.
	  llvm::SetVector<mlir::Operation *> ivForwardSlice;
	  mlir::getForwardSlice(outerLoop.getInductionVar(), &ivForwardSlice,
	                        sliceOptions);
	  llvm::DenseSet<mlir::Operation *> ivUpdateOps(ivForwardSlice.begin(),
	                                                ivForwardSlice.end());

	  // Gather every op nested in the outer loop, ignoring the outer loop itself,
	  // the inner loop together with everything it contains, and `fir.result`
	  // ops. For the running example, this yields the ops in lines 2 & 3 as well.
	  llvm::DenseSet<mlir::Operation *> remainingBodyOps;
	  outerLoop.walk<mlir::WalkOrder::PreOrder>([&](mlir::Operation *visited) {
	    // Do not descend into the inner loop.
	    if (visited == innerLoop)
	      return mlir::WalkResult::skip();

	    const bool isIgnored =
	        visited == outerLoop || mlir::isa<fir::ResultOp>(visited);
	    if (!isIgnored)
	      remainingBodyOps.insert(visited);

	    return mlir::WalkResult::advance();
	  });

	  // Both sets being equal means that, apart from the inner loop and the
	  // terminator, the outer loop only contains the ops that set up its
	  // induction variable; i.e. the inner loop is perfectly nested.
	  const bool isPerfect = (remainingBodyOps == ivUpdateOps);
	  LLVM_DEBUG(DBGS() << "Loop pair starting at location " << outerLoop.getLoc()
	                    << " is" << (isPerfect ? "" : " not")
	                    << " perfectly nested\n");

	  return isPerfect;
	}

	/// Starting with \p currentLoop, collects a perfectly nested loop nest, if
	/// any, into \p loopNest (outermost loop first).
	///
	/// As many loops as possible are collected: in case a certain nested loop is
	/// not recognized as part of the nest, the parent loops discovered so far are
	/// kept in \p loopNest.
	///
	/// \return failure if collection stopped at a loop that contains more than one
	/// `unordered` loop, or a single `unordered` loop that is not perfectly
	/// nested; success if the innermost collected loop directly contains no
	/// `unordered` loops.
	mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
	                                    LoopNestToIndVarMap &loopNest) {
	  assert(currentLoop.getUnordered());

	  while (true) {
	    loopNest.insert({currentLoop, InductionVariableInfo(currentLoop)});

	    // Gather the `unordered` loops that are direct children of the current
	    // loop's region.
	    llvm::SmallVector<fir::DoLoopOp> nestedCandidates;
	    for (fir::DoLoopOp candidate :
	         currentLoop.getRegion().getOps<fir::DoLoopOp>()) {
	      if (candidate.getUnordered())
	        nestedCandidates.push_back(candidate);
	    }

	    // Reached the innermost loop of the nest.
	    if (nestedCandidates.empty())
	      break;

	    // More than one unordered loop means that this is not a perfect loop nest
	    // (i.e. a multi-range `do concurrent` loop), which is the only case
	    // handled here. The same goes for a single loop that is not perfectly
	    // nested.
	    const bool hasSingleCandidate = nestedCandidates.size() == 1;
	    if (!hasSingleCandidate ||
	        !isPerfectlyNested(currentLoop, nestedCandidates.front()))
	      return mlir::failure();

	    currentLoop = nestedCandidates.front();
	  }

	  return mlir::success();
	}

	/// Reshapes a perfectly nested `fir.do_loop` nest so that the region of its
	/// innermost loop can be inlined as-is into an `omp.loop_nest` op. As an
	/// example, the following input IR:
	/// ```
	///   fir.do_loop %i_iv = %i_lb to %i_ub step %i_step unordered {
	///     fir.store %i_iv to %i#1 : !fir.ref<i32>
	///     %j_lb = arith.constant 1 : i32
	///     %j_ub = arith.constant 10 : i32
	///     %j_step = arith.constant 1 : index
	///
	///     fir.do_loop %j_iv = %j_lb to %j_ub step %j_step unordered {
	///       fir.store %j_iv to %j#1 : !fir.ref<i32>
	///       ...
	///     }
	///   }
	/// ```
	///
	/// is rewritten to the form below (shown in generic op syntax because the
	/// result is, strictly speaking, not a valid `fir.do_loop` op):
	///
	/// ```
	///   "fir.do_loop"(%i_lb, %i_ub, %i_step) <{unordered}> ({
	///   ^bb0(%i_iv: index):
	///     %j_lb = "arith.constant"() <{value = 1 : i32}> : () -> i32
	///     %j_ub = "arith.constant"() <{value = 10 : i32}> : () -> i32
	///     %j_step = "arith.constant"() <{value = 1 : index}> : () -> index
	///
	///     "fir.do_loop"(%j_lb, %j_ub, %j_step) <{unordered}> ({
	///     ^bb0(%new_i_iv: index, %new_j_iv: index):
	///       "fir.store"(%new_i_iv, %i#1) : (i32, !fir.ref<i32>) -> ()
	///       "fir.store"(%new_j_iv, %j#1) : (i32, !fir.ref<i32>) -> ()
	///       ...
	///     })
	///   })
	/// ```
	///
	/// In other words:
	///
	/// * the entry block of the innermost loop grows from one argument to `n`
	///   arguments, `n` being the number of loops in the nest,
	///
	/// * the ops that update the IVs of the outer loop(s) are sunk into the
	///   innermost loop (see the `"fir.store"(%new_i_iv, %i#1)` op above),
	///
	/// * the arguments of the innermost entry block are assigned to the IVs in
	///   nest order: first argument for the outermost loop, last argument for the
	///   innermost loop.
	///
	/// Nothing is done when \p loopNest holds at most one loop.
	///
	/// Pre-condition: \p loopNest consists of perfectly nested loops, i.e. the only
	/// ops found between two nested loops are the ones that set up the inner
	/// loop's LB, UB, and step. See `genLoopNestClauseOps(..)` for how those
	/// bounds are consumed.
	void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter,
	                    looputils::LoopNestToIndVarMap &loopNest) {
	  if (loopNest.size() <= 1)
	    return;

	  fir::DoLoopOp innerLoop = loopNest.back().first;
	  mlir::Region &innerRegion = innerLoop.getRegion();
	  // Everything sunk from the outer loops lands in front of this op.
	  mlir::Operation &sinkAnchor = innerRegion.front().front();

	  llvm::SmallVector<mlir::Type> outerIVTypes;
	  llvm::SmallVector<mlir::Location> outerIVLocs;

	  // Only the outer loops need handling here; `drop_end` leaves the innermost
	  // loop out.
	  for (auto &[outerLoop, ivInfo] : llvm::drop_end(loopNest)) {
	    mlir::Value outerIV = outerLoop.getInductionVar();
	    outerIVTypes.push_back(outerIV.getType());
	    outerIVLocs.push_back(outerIV.getLoc());

	    // Sink the ops updating this outer IV into the innermost loop.
	    for (mlir::Operation *updateOp : ivInfo.indVarUpdateOps)
	      updateOp->moveBefore(&sinkAnchor);
	  }

	  // Append one entry-block argument per outer loop IV.
	  innerRegion.addArguments(outerIVTypes, outerIVLocs);

	  // Rewire the IVs while walking the nest from the innermost loop outwards
	  // and the block arguments from last to first. The order matters: argument
	  // #0 is still the original innermost IV, so its uses must be redirected to
	  // the last argument before argument #0 becomes the replacement for the
	  // outermost IV.
	  unsigned argPos = innerRegion.getNumArguments();
	  for (auto &[loop, unusedInfo] : llvm::reverse(loopNest)) {
	    --argPos;
	    loop.getInductionVar().replaceAllUsesWith(
	        innerRegion.getArgument(argPos));
	  }
	}

	/// Gathers the "loop-local values" of a loop. A value qualifies when it is the
	/// result of a `fir.alloca` located outside of the loop while every one of its
	/// users lives inside the loop. These are typically temporaries that the loop
	/// body uses, for example, to initialize other variables.
	///
	/// See `flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90` for an
	/// example of why we need this.
	///
	/// \param [in] doLoop - the loop whose operands are inspected for values used
	/// exclusively inside it.
	///
	/// \param [out] locals - receives the loop-local values detected for
	/// \p doLoop; values already present in the set are left untouched.
	void collectLoopLocalValues(fir::DoLoopOp doLoop,
	                            llvm::SetVector<mlir::Value> &locals) {
	  auto isInsideLoop = [&](mlir::Operation *candidate) {
	    return doLoop->isAncestor(candidate);
	  };

	  doLoop.walk([&](mlir::Operation *nestedOp) {
	    for (mlir::Value value : nestedOp->getOperands()) {
	      // Already classified as loop-local.
	      if (locals.contains(value))
	        continue;

	      // Only results of `fir.alloca` ops are candidates.
	      mlir::Operation *definer = value.getDefiningOp();
	      if (!mlir::isa_and_present<fir::AllocaOp>(definer))
	        continue;

	      // Allocations made inside the loop need no localization.
	      if (isInsideLoop(definer))
	        continue;

	      // A single user outside of the loop disqualifies the value.
	      if (llvm::all_of(value.getUsers(), isInsideLoop))
	        locals.insert(value);
	    }
	  });
	}

	/// Localizes a "loop-local" value within the parallel region its loop maps to
	/// by moving the op that defines \p local to the very beginning of
	/// \p allocRegion.
	///
	/// \param local - a value used exclusively within a loop's scope (see
	/// collectLoopLocalValues).
	///
	/// \param allocRegion - the parallel region that will host \p local's
	/// allocation.
	///
	/// \param rewriter - the rewriter through which the move is performed.
	static void localizeLoopLocalValue(mlir::Value local, mlir::Region &allocRegion,
	                                   mlir::ConversionPatternRewriter &rewriter) {
	  mlir::Operation *allocOp = local.getDefiningOp();
	  mlir::Operation &firstRegionOp = allocRegion.front().front();
	  rewriter.moveOpBefore(allocOp, &firstRegionOp);
	}
	} // namespace looputils

	/// Conversion pattern that rewrites a `fir.do_loop` (nest) into
	/// `omp.parallel { omp.wsloop { omp.loop_nest { ... } } }`.
	///
	/// Which `fir.do_loop` ops the pattern is applied to is decided by the
	/// `ConversionTarget` set up by the pass; the pattern itself only asserts (via
	/// `collectLoopNest`) that the root loop is `unordered`.
	class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
	public:
	  using mlir::OpConversionPattern<fir::DoLoopOp>::OpConversionPattern;

	  /// \param context - the MLIR context the pattern is registered in.
	  ///
	  /// \param mapToDevice - whether loops should be mapped to the device; this is
	  /// not implemented yet and makes `matchAndRewrite` emit an error.
	  ///
	  /// \param concurrentLoopsToSkip - set shared with the pass; the pattern adds
	  /// to it the `unordered` loops that remain inside a converted nest so that
	  /// the legality check can treat them as legal.
	  DoConcurrentConversion(mlir::MLIRContext *context, bool mapToDevice,
	                         llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip)
	      : OpConversionPattern(context), mapToDevice(mapToDevice),
	        concurrentLoopsToSkip(concurrentLoopsToSkip) {}

	  /// Converts the loop nest rooted at \p doLoop.
	  ///
	  /// \return failure (through the emitted error) when device mapping is
	  /// requested, success otherwise.
	  mlir::LogicalResult
	  matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor,
	                  mlir::ConversionPatternRewriter &rewriter) const override {
	    if (mapToDevice)
	      return doLoop.emitError(
	          "not yet implemented: Mapping `do concurrent` loops to device");

	    // Collect as much of the perfectly nested loop nest as possible. A failure
	    // only means that some nested `unordered` loops could not be made part of
	    // the nest; the loops collected so far are still converted.
	    looputils::LoopNestToIndVarMap loopNest;
	    bool hasRemainingNestedLoops =
	        failed(looputils::collectLoopNest(doLoop, loopNest));
	    if (hasRemainingNestedLoops)
	      mlir::emitWarning(doLoop.getLoc(),
	                        "Some `do concurent` loops are not perfectly-nested. "
	                        "These will be serialized.");

	    // Loop-local values are computed on the innermost collected loop before
	    // the nest is reshaped by `sinkLoopIVArgs`.
	    llvm::SetVector<mlir::Value> locals;
	    looputils::collectLoopLocalValues(loopNest.back().first, locals);
	    looputils::sinkLoopIVArgs(rewriter, loopNest);

	    mlir::IRMapping mapper;
	    mlir::omp::ParallelOp parallelOp =
	        genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper);
	    mlir::omp::LoopNestOperands loopNestClauseOps;
	    genLoopNestClauseOps(doLoop.getLoc(), rewriter, loopNest, mapper,
	                         loopNestClauseOps);

	    // Move the allocations of loop-local values into the parallel region.
	    for (mlir::Value local : locals)
	      looputils::localizeLoopLocalValue(local, parallelOp.getRegion(),
	                                        rewriter);

	    // `mapToDevice` is necessarily false here because of the early return
	    // above.
	    mlir::omp::LoopNestOp ompLoopNest =
	        genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps,
	                    /*isComposite=*/mapToDevice);

	    rewriter.eraseOp(doLoop);

	    // Mark `unordered` loops that are not perfectly nested to be skipped from
	    // the legality check of the `ConversionTarget` since we are not interested
	    // in mapping them to OpenMP.
	    ompLoopNest->walk([&](fir::DoLoopOp nestedLoop) {
	      if (nestedLoop.getUnordered())
	        concurrentLoopsToSkip.insert(nestedLoop);
	    });

	    return mlir::success();
	  }

	private:
	  /// Creates an `omp.parallel` op at the rewriter's current insertion point,
	  /// gives it an entry block ending in an `omp.terminator`, and clones the
	  /// induction variable declarations/allocations of \p loopNest in front of
	  /// that terminator. The rewriter is left pointing right before the
	  /// terminator.
	  ///
	  /// \return the newly created `omp.parallel` op.
	  mlir::omp::ParallelOp genParallelOp(mlir::Location loc,
	                                      mlir::ConversionPatternRewriter &rewriter,
	                                      looputils::LoopNestToIndVarMap &loopNest,
	                                      mlir::IRMapping &mapper) const {
	    auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loc);
	    rewriter.createBlock(&parallelOp.getRegion());
	    rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));

	    genLoopNestIndVarAllocs(rewriter, loopNest, mapper);
	    return parallelOp;
	  }

	  /// Calls `genInductionVariableAlloc` for the iteration variable of every loop
	  /// in \p loopNest.
	  void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter,
	                               looputils::LoopNestToIndVarMap &loopNest,
	                               mlir::IRMapping &mapper) const {

	    for (auto &[_, indVarInfo] : loopNest)
	      genInductionVariableAlloc(rewriter, indVarInfo.iterVarMemDef, mapper);
	  }

	  /// Clones, at the rewriter's current insertion point, the ops defining the
	  /// operands of \p indVarMemDef followed by \p indVarMemDef itself. The clones
	  /// are created through \p mapper.
	  ///
	  /// Operands without a defining op (i.e. block arguments) have nothing to
	  /// clone and are skipped.
	  ///
	  /// \return the clone of \p indVarMemDef.
	  mlir::Operation *
	  genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter,
	                            mlir::Operation *indVarMemDef,
	                            mlir::IRMapping &mapper) const {
	    assert(
	        indVarMemDef != nullptr &&
	        "Induction variable memdef is expected to have a defining operation.");

	    // A set is used so that an op feeding several operands is cloned once.
	    llvm::SmallSetVector<mlir::Operation *, 2> indVarDeclareAndAlloc;
	    for (auto operand : indVarMemDef->getOperands())
	      if (mlir::Operation *operandDef = operand.getDefiningOp())
	        indVarDeclareAndAlloc.insert(operandDef);
	    indVarDeclareAndAlloc.insert(indVarMemDef);

	    // `indVarMemDef` is inserted last, hence `result` ends up being its clone.
	    mlir::Operation *result = nullptr;
	    for (mlir::Operation *opToClone : indVarDeclareAndAlloc)
	      result = rewriter.clone(*opToClone, mapper);

	    return result;
	  }

	  /// Fills \p loopNestClauseOps with the lower bound, upper bound, and step of
	  /// every loop in \p loopNest (outermost loop first) and marks the bounds as
	  /// inclusive.
	  ///
	  /// Every bound is expected to be produced by an op; the first result of that
	  /// op is what gets recorded.
	  void genLoopNestClauseOps(
	      mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
	      looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper,
	      mlir::omp::LoopNestOperands &loopNestClauseOps) const {
	    assert(loopNestClauseOps.loopLowerBounds.empty() &&
	           "Loop nest bounds were already emitted!");

	    auto populateBounds = [](mlir::Value var,
	                             llvm::SmallVectorImpl<mlir::Value> &bounds) {
	      mlir::Operation *boundDef = var.getDefiningOp();
	      assert(boundDef != nullptr &&
	             "Loop bounds are expected to have a defining operation.");
	      bounds.push_back(boundDef->getResult(0));
	    };

	    for (auto &[doLoop, _] : loopNest) {
	      populateBounds(doLoop.getLowerBound(), loopNestClauseOps.loopLowerBounds);
	      populateBounds(doLoop.getUpperBound(), loopNestClauseOps.loopUpperBounds);
	      populateBounds(doLoop.getStep(), loopNestClauseOps.loopSteps);
	    }

	    loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
	  }

	  /// Creates an `omp.wsloop` op wrapping an `omp.loop_nest` op built from
	  /// \p clauseOps, clones the region of \p doLoop into the `omp.loop_nest`
	  /// (remapping values through \p mapper), and replaces the terminator of the
	  /// cloned region with an `omp.yield`.
	  ///
	  /// \param isComposite - forwarded to `setComposite` on the `omp.wsloop` op.
	  ///
	  /// \return the newly created `omp.loop_nest` op.
	  mlir::omp::LoopNestOp
	  genWsLoopOp(mlir::ConversionPatternRewriter &rewriter, fir::DoLoopOp doLoop,
	              mlir::IRMapping &mapper,
	              const mlir::omp::LoopNestOperands &clauseOps,
	              bool isComposite) const {

	    auto wsloopOp = rewriter.create<mlir::omp::WsloopOp>(doLoop.getLoc());
	    wsloopOp.setComposite(isComposite);
	    rewriter.createBlock(&wsloopOp.getRegion());

	    auto loopNestOp =
	        rewriter.create<mlir::omp::LoopNestOp>(doLoop.getLoc(), clauseOps);

	    // Clone the loop's body inside the loop nest construct using the
	    // mapped values.
	    rewriter.cloneRegionBefore(doLoop.getRegion(), loopNestOp.getRegion(),
	                               loopNestOp.getRegion().begin(), mapper);

	    // Swap the terminator cloned from the `fir.do_loop` for an `omp.yield`.
	    mlir::Operation *terminator = loopNestOp.getRegion().back().getTerminator();
	    rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back());
	    rewriter.create<mlir::omp::YieldOp>(terminator->getLoc());
	    rewriter.eraseOp(terminator);

	    return loopNestOp;
	  }

	  /// Whether loops are requested to be mapped to the device.
	  bool mapToDevice;
	  /// `unordered` loops left inside converted nests; owned by the pass.
	  llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip;
	};

	/// Function pass that applies `DoConcurrentConversion` to the `unordered`
	/// `fir.do_loop` ops of a function.
	class DoConcurrentConversionPass
	    : public flangomp::impl::DoConcurrentConversionPassBase<
	          DoConcurrentConversionPass> {
	public:
	  DoConcurrentConversionPass() = default;

	  DoConcurrentConversionPass(
	      const flangomp::DoConcurrentConversionPassOptions &options)
	      : DoConcurrentConversionPassBase(options) {}

	  /// Runs a full dialect conversion on the current function. Function
	  /// declarations are skipped, and an invalid `map-to` option makes the pass
	  /// emit a warning and leave the function untouched.
	  void runOnOperation() override {
	    mlir::func::FuncOp func = getOperation();

	    if (func.isDeclaration())
	      return;

	    mlir::MLIRContext *context = &getContext();

	    if (mapTo != flangomp::DoConcurrentMappingKind::DCMK_Host &&
	        mapTo != flangomp::DoConcurrentMappingKind::DCMK_Device) {
	      mlir::emitWarning(mlir::UnknownLoc::get(context),
	                        "DoConcurrentConversionPass: invalid `map-to` value. "
	                        "Valid values are: `host` or `device`");
	      return;
	    }

	    // Filled in by the pattern with the `unordered` loops that are left inside
	    // converted nests; consulted by the legality callback below.
	    llvm::DenseSet<fir::DoLoopOp> concurrentLoopsToSkip;
	    mlir::RewritePatternSet patterns(context);
	    patterns.insert<DoConcurrentConversion>(
	        context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device,
	        concurrentLoopsToSkip);
	    mlir::ConversionTarget target(*context);
	    target.addDynamicallyLegalOp<fir::DoLoopOp>([&](fir::DoLoopOp op) {
	      // The goal is to handle constructs that eventually get lowered to
	      // `fir.do_loop` with the `unordered` attribute (e.g. array expressions).
	      // Currently, this is only enabled for the `do concurrent` construct since
	      // the pass runs early in the pipeline.
	      return !op.getUnordered() || concurrentLoopsToSkip.contains(op);
	    });
	    // Every op other than `fir.do_loop` is left as is.
	    target.markUnknownOpDynamicallyLegal(
	        [](mlir::Operation *) { return true; });

	    if (mlir::failed(mlir::applyFullConversion(getOperation(), target,
	                                               std::move(patterns)))) {
	      signalPassFailure();
	    }
	  }
	};
	} // namespace

	std::unique_ptr<mlir::Pass>
	flangomp::createDoConcurrentConversionPass(bool mapToDevice) {
	DoConcurrentConversionPassOptions options;
	options.mapTo = mapToDevice ? flangomp::DoConcurrentMappingKind::DCMK_Device
	: flangomp::DoConcurrentMappingKind::DCMK_Host;

	return std::make_unique<DoConcurrentConversionPass>(options);
	}