//===- GenericLoopConversion.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Common/OpenMP-utils.h"

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"

#include <memory>

namespace flangomp {
#define GEN_PASS_DEF_GENERICLOOPCONVERSIONPASS
#include "flang/Optimizer/OpenMP/Passes.h.inc"
} // namespace flangomp

namespace {

/// A conversion pattern to handle various combined forms of `omp.loop`. For how
/// combined/composite directives are handled, see:
/// https://discourse.llvm.org/t/rfc-representing-combined-composite-constructs-in-the-openmp-dialect/76986.
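///
/// For reference, the three forms this pattern distinguishes are, roughly (all
/// clauses and loop bounds omitted, so the snippets below are only a sketch):
///
///   omp.loop { ... }                    // Standalone.
///   omp.teams { omp.loop { ... } }      // Combined `teams loop`.
///   omp.parallel { omp.loop { ... } }   // Combined `parallel loop`.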
class GenericLoopConversionPattern
    : public mlir::OpConversionPattern<mlir::omp::LoopOp> {
public:
  enum class GenericLoopCombinedInfo { Standalone, TeamsLoop, ParallelLoop };

  using mlir::OpConversionPattern<mlir::omp::LoopOp>::OpConversionPattern;

  explicit GenericLoopConversionPattern(mlir::MLIRContext *ctx)
      : mlir::OpConversionPattern<mlir::omp::LoopOp>{ctx} {
    // Enable rewrite recursion to make sure nested `loop` directives are
    // handled.
    this->setHasBoundedRewriteRecursion(true);
  }

  mlir::LogicalResult
  matchAndRewrite(mlir::omp::LoopOp loopOp, OpAdaptor adaptor,
                  mlir::ConversionPatternRewriter &rewriter) const override {
    assert(mlir::succeeded(checkLoopConversionSupportStatus(loopOp)));

    GenericLoopCombinedInfo combinedInfo = findGenericLoopCombineInfo(loopOp);

    switch (combinedInfo) {
    case GenericLoopCombinedInfo::Standalone:
      rewriteStandaloneLoop(loopOp, rewriter);
      break;
    case GenericLoopCombinedInfo::ParallelLoop:
      llvm_unreachable(
          "not yet implemented: Combined `parallel loop` directive");
      break;
    case GenericLoopCombinedInfo::TeamsLoop:
      rewriteToDistributeParallelDo(loopOp, rewriter);
      break;
    }

    rewriter.eraseOp(loopOp);
    return mlir::success();
  }

  static mlir::LogicalResult
  checkLoopConversionSupportStatus(mlir::omp::LoopOp loopOp) {
    GenericLoopCombinedInfo combinedInfo = findGenericLoopCombineInfo(loopOp);

    switch (combinedInfo) {
    case GenericLoopCombinedInfo::Standalone:
      break;
    case GenericLoopCombinedInfo::ParallelLoop:
      return loopOp.emitError(
          "not yet implemented: Combined `parallel loop` directive");
    case GenericLoopCombinedInfo::TeamsLoop:
      break;
    }

    auto todo = [&loopOp](mlir::StringRef clauseName) {
      return loopOp.emitError()
             << "not yet implemented: Unhandled clause " << clauseName << " in "
             << loopOp->getName() << " operation";
    };

    // The `bind` clause is already supported for standalone directives.
    // Handling it on the combined forms will be done in a follow-up PR.
    if (combinedInfo != GenericLoopCombinedInfo::Standalone &&
        loopOp.getBindKind())
      return todo("bind");

    if (loopOp.getOrder())
      return todo("order");

    if (!loopOp.getReductionVars().empty())
      return todo("reduction");

    // TODO For `teams loop`, check constraints similar to those checked by
    // `TeamsLoopChecker` in SemaOpenMP.cpp.
    return mlir::success();
  }

private:
  static GenericLoopCombinedInfo
  findGenericLoopCombineInfo(mlir::omp::LoopOp loopOp) {
    mlir::Operation *parentOp = loopOp->getParentOp();
    GenericLoopCombinedInfo result = GenericLoopCombinedInfo::Standalone;

    if (auto teamsOp = mlir::dyn_cast_if_present<mlir::omp::TeamsOp>(parentOp))
      result = GenericLoopCombinedInfo::TeamsLoop;

    if (auto parallelOp =
            mlir::dyn_cast_if_present<mlir::omp::ParallelOp>(parentOp))
      result = GenericLoopCombinedInfo::ParallelLoop;

    return result;
  }

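  /// Rewrites a standalone `loop` directive according to its `bind` clause:
  /// no `bind` clause or `bind(thread)` lowers to `simd`, `bind(parallel)`
  /// lowers to `wsloop`, and `bind(teams)` lowers to `distribute`.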
  void rewriteStandaloneLoop(mlir::omp::LoopOp loopOp,
                             mlir::ConversionPatternRewriter &rewriter) const {
    using namespace mlir::omp;
    std::optional<ClauseBindKind> bindKind = loopOp.getBindKind();

    if (!bindKind.has_value())
      return rewriteToSimdLoop(loopOp, rewriter);

    switch (*bindKind) {
    case ClauseBindKind::Parallel:
      return rewriteToWsloop(loopOp, rewriter);
    case ClauseBindKind::Teams:
      return rewriteToDistribute(loopOp, rewriter);
    case ClauseBindKind::Thread:
      return rewriteToSimdLoop(loopOp, rewriter);
    }
  }

  /// Rewrites standalone `loop` directives (those without a `bind` clause or
  /// with `bind(thread)`) to equivalent `simd` constructs.
  ///
  /// The reasoning behind this decision is that according to the spec (version
  /// 5.2, section 11.7.1):
  ///
  /// "If the bind clause is not specified on a construct for which it may be
  /// specified and the construct is closely nested inside a teams or parallel
  /// construct, the effect is as if binding is teams or parallel. If none of
  /// those conditions hold, the binding region is not defined."
  ///
  /// which means that standalone `loop` directives have an undefined binding
  /// region. Moreover, the spec says (in the next paragraph):
  ///
  /// "The specified binding region determines the binding thread set.
  /// Specifically, if the binding region is a teams region, then the binding
  /// thread set is the set of initial threads that are executing that region
  /// while if the binding region is a parallel region, then the binding thread
  /// set is the team of threads that are executing that region. If the binding
  /// region is not defined, then the binding thread set is the encountering
  /// thread."
  ///
  /// which means that the binding thread set for a standalone `loop` directive
  /// is only the encountering thread.
  ///
  /// Since the encountering thread is the binding thread (set) for a
  /// standalone `loop` directive, the best we can do in such a case is to
  /// "simd" the directive.
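  ///
  /// As a rough sketch (clauses and loop bounds omitted, so the syntax is only
  /// approximate), the rewrite turns:
  ///
  ///   omp.loop {
  ///     omp.loop_nest ... { ... }
  ///   }
  ///
  /// into:
  ///
  ///   omp.simd {
  ///     omp.loop_nest ... { ... }
  ///   }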
  void rewriteToSimdLoop(mlir::omp::LoopOp loopOp,
                         mlir::ConversionPatternRewriter &rewriter) const {
    loopOp.emitWarning(
        "Detected standalone OpenMP `loop` directive with thread binding, "
        "the associated loop will be rewritten to `simd`.");
    rewriteToSingleWrapperOp<mlir::omp::SimdOp, mlir::omp::SimdOperands>(
        loopOp, rewriter);
  }

  void rewriteToDistribute(mlir::omp::LoopOp loopOp,
                           mlir::ConversionPatternRewriter &rewriter) const {
    rewriteToSingleWrapperOp<mlir::omp::DistributeOp,
                             mlir::omp::DistributeOperands>(loopOp, rewriter);
  }

  void rewriteToWsloop(mlir::omp::LoopOp loopOp,
                       mlir::ConversionPatternRewriter &rewriter) const {
    rewriteToSingleWrapperOp<mlir::omp::WsloopOp, mlir::omp::WsloopOperands>(
        loopOp, rewriter);
  }

  // TODO Suggestion by Sergio: tag auto-generated operations for constructs
  // that weren't part of the original program; that would be useful
  // information for debugging purposes later on. This new attribute could be
  // used for `omp.loop`, but also for `do concurrent` transformations,
  // `workshare`, `workdistribute`, etc. The tag could be attached to all kinds
  // of auto-generated operations using a dialect attribute (named something
  // like `omp.origin` or `omp.derived`) and could perhaps hold the name of the
  // operation it was derived from and the reason it was transformed, which we
  // could then use when emitting any messages related to it later on.
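  /// Rewrites `loop` into a single loop-wrapper operation of type `OpTy` (one
  /// of `omp.simd`, `omp.distribute`, or `omp.wsloop`), forwarding the `loop`
  /// op's private clauses and cloning its nested `omp.loop_nest` into the new
  /// wrapper's region.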
  template <typename OpTy, typename OpOperandsTy>
  void
  rewriteToSingleWrapperOp(mlir::omp::LoopOp loopOp,
                           mlir::ConversionPatternRewriter &rewriter) const {
    OpOperandsTy clauseOps;
    clauseOps.privateVars = loopOp.getPrivateVars();

    auto privateSyms = loopOp.getPrivateSyms();
    if (privateSyms)
      clauseOps.privateSyms.assign(privateSyms->begin(), privateSyms->end());

    Fortran::common::openmp::EntryBlockArgs args;
    args.priv.vars = clauseOps.privateVars;

    auto wrapperOp = rewriter.create<OpTy>(loopOp.getLoc(), clauseOps);
    mlir::Block *opBlock = genEntryBlock(rewriter, args, wrapperOp.getRegion());

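    // Clone the single op nested in the `loop` op's region (its
    // `omp.loop_nest`) into the new wrapper, remapping the `loop` region's
    // entry-block arguments (the private variables) to the wrapper's own
    // entry-block arguments.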
    mlir::IRMapping mapper;
    mlir::Block &loopBlock = *loopOp.getRegion().begin();

    for (auto [loopOpArg, opArg] :
         llvm::zip_equal(loopBlock.getArguments(), opBlock->getArguments()))
      mapper.map(loopOpArg, opArg);

    rewriter.clone(*loopOp.begin(), mapper);
  }

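  /// Rewrites the combined `teams loop` form to an equivalent composite
  /// `distribute parallel do` nest; roughly (clauses omitted, so the snippet
  /// below is only a sketch), the nested `loop` becomes:
  ///
  ///   omp.teams {
  ///     omp.parallel {
  ///       omp.distribute {
  ///         omp.wsloop {
  ///           omp.loop_nest ... { ... }
  ///         }
  ///       }
  ///     }
  ///   }
  ///
  /// where the `omp.teams` op is the already-existing parent of the original
  /// `loop` op.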
  void rewriteToDistributeParallelDo(
      mlir::omp::LoopOp loopOp,
      mlir::ConversionPatternRewriter &rewriter) const {
    mlir::omp::ParallelOperands parallelClauseOps;
    parallelClauseOps.privateVars = loopOp.getPrivateVars();

    auto privateSyms = loopOp.getPrivateSyms();
    if (privateSyms)
      parallelClauseOps.privateSyms.assign(privateSyms->begin(),
                                           privateSyms->end());

    Fortran::common::openmp::EntryBlockArgs parallelArgs;
    parallelArgs.priv.vars = parallelClauseOps.privateVars;

    auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loopOp.getLoc(),
                                                             parallelClauseOps);
    mlir::Block *parallelBlock =
        genEntryBlock(rewriter, parallelArgs, parallelOp.getRegion());
    parallelOp.setComposite(true);
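    // Create the terminator of the `parallel` region up front and insert every
    // subsequently created op before it, i.e. inside the `parallel` region.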
    rewriter.setInsertionPoint(
        rewriter.create<mlir::omp::TerminatorOp>(loopOp.getLoc()));

    mlir::omp::DistributeOperands distributeClauseOps;
    auto distributeOp = rewriter.create<mlir::omp::DistributeOp>(
        loopOp.getLoc(), distributeClauseOps);
    distributeOp.setComposite(true);
    rewriter.createBlock(&distributeOp.getRegion());

    mlir::omp::WsloopOperands wsloopClauseOps;
    auto wsloopOp =
        rewriter.create<mlir::omp::WsloopOp>(loopOp.getLoc(), wsloopClauseOps);
    wsloopOp.setComposite(true);
    rewriter.createBlock(&wsloopOp.getRegion());

    mlir::IRMapping mapper;
    mlir::Block &loopBlock = *loopOp.getRegion().begin();

    for (auto [loopOpArg, parallelOpArg] : llvm::zip_equal(
             loopBlock.getArguments(), parallelBlock->getArguments()))
      mapper.map(loopOpArg, parallelOpArg);

    rewriter.clone(*loopOp.begin(), mapper);
  }
};

class GenericLoopConversionPass
    : public flangomp::impl::GenericLoopConversionPassBase<
          GenericLoopConversionPass> {
public:
  GenericLoopConversionPass() = default;

  void runOnOperation() override {
    mlir::func::FuncOp func = getOperation();

    if (func.isDeclaration())
      return;

    mlir::MLIRContext *context = &getContext();
    mlir::RewritePatternSet patterns(context);
    patterns.insert<GenericLoopConversionPattern>(context);
    mlir::ConversionTarget target(*context);

    target.markUnknownOpDynamicallyLegal(
        [](mlir::Operation *) { return true; });
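    // `omp.loop` ops whose conversion is not yet supported are reported as
    // legal so that `applyFullConversion` does not attempt to rewrite them;
    // the status check itself emits the corresponding `not yet implemented`
    // diagnostic.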
    target.addDynamicallyLegalOp<mlir::omp::LoopOp>(
        [](mlir::omp::LoopOp loopOp) {
          return mlir::failed(
              GenericLoopConversionPattern::checkLoopConversionSupportStatus(
                  loopOp));
        });

    if (mlir::failed(mlir::applyFullConversion(getOperation(), target,
                                               std::move(patterns)))) {
      mlir::emitError(func.getLoc(), "error in converting `omp.loop` op");
      signalPassFailure();
    }
  }
};
} // namespace