blob: b063e6e775e63402990b666e93c280a3accf239d [file] [log] [blame]
//===- Transforms.h - SCF dialect transformation utilities ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This header file defines transformations on SCF operations.
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_DIALECT_SCF_TRANSFORMS_TRANSFORMS_H_
#define MLIR_DIALECT_SCF_TRANSFORMS_TRANSFORMS_H_
#include "mlir/Dialect/SCF/Utils/AffineCanonicalizationUtils.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/ArrayRef.h"
namespace mlir {
struct LogicalResult;
class Region;
class RewriterBase;
class Operation;
class Value;
namespace scf {
class IfOp;
class ForallOp;
class ForOp;
class ParallelOp;
class WhileOp;
/// Try converting scf.forall into a set of nested scf.for loops.
/// The newly created scf.for ops will be returned through the `results`
/// vector if provided.
LogicalResult forallToForLoop(RewriterBase &rewriter, ForallOp forallOp,
SmallVectorImpl<Operation *> *results = nullptr);
/// Fuses all adjacent scf.parallel operations with identical bounds and step
/// into one scf.parallel operations. Uses a naive aliasing and dependency
/// analysis.
/// User can additionally customize alias checking with `mayAlias` hook.
/// `mayAlias` must return false if 2 values are guaranteed to not alias.
void naivelyFuseParallelOps(Region &region,
llvm::function_ref<bool(Value, Value)> mayAlias);
/// Rewrite a for loop with bounds/step that potentially do not divide evenly
/// into a for loop where the step divides the iteration space evenly, followed
/// by another scf.for for the last (partial) iteration (if any; returned via
/// `partialIteration`). This transformation is called "loop peeling".
///
/// This transformation is beneficial for a wide range of transformations such
/// as vectorization or loop tiling: It enables additional canonicalizations
/// inside the peeled loop body such as rewriting masked loads into unmaked
/// loads.
///
/// E.g., assuming a lower bound of 0 (for illustration purposes):
/// ```
/// scf.for %iv = %c0 to %ub step %c4 {
/// (loop body)
/// }
/// ```
/// is rewritten into the following pseudo IR:
/// ```
/// %newUb = %ub - (%ub mod %c4)
/// scf.for %iv = %c0 to %newUb step %c4 {
/// (loop body)
/// }
/// scf.for %iv2 = %newUb to %ub {
/// (loop body)
/// }
/// ```
///
/// After loop peeling, this function tries to simplify affine.min and
/// affine.max ops in the body of the peeled loop and in the body of the partial
/// iteration loop, taking advantage of the fact that the peeled loop has only
/// "full" iterations. This simplification is expected to enable further
/// canonicalization opportunities through other patterns.
///
/// The return value indicates whether the loop was rewritten or not. Loops are
/// not rewritten if:
/// * Loop step size is 1 or
/// * Loop bounds and step size are static, and step already divides the
/// iteration space evenly.
///
/// Note: This function rewrites the given scf.for loop in-place and creates a
/// new scf.for operation for the last iteration. It replaces all uses of the
/// unpeeled loop with the results of the newly generated scf.for.
LogicalResult peelForLoopAndSimplifyBounds(RewriterBase &rewriter, ForOp forOp,
scf::ForOp &partialIteration);
/// Peel the first iteration out of the scf.for loop. If there is only one
/// iteration, return the original loop.
LogicalResult peelForLoopFirstIteration(RewriterBase &rewriter, ForOp forOp,
scf::ForOp &partialIteration);
/// Tile a parallel loop of the form
/// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
/// step (%arg4, %arg5)
///
/// into
/// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
/// step (%arg4*tileSize[0],
/// %arg5*tileSize[1])
/// scf.parallel (%j0, %j1) = (0, 0) to (min(tileSize[0], %arg2-%j0)
/// min(tileSize[1], %arg3-%j1))
/// step (%arg4, %arg5)
/// The old loop is replaced with the new one.
///
/// The function returns the resulting ParallelOps, i.e. {outer_loop_op,
/// inner_loop_op}.
std::pair<ParallelOp, ParallelOp>
tileParallelLoop(ParallelOp op, llvm::ArrayRef<int64_t> tileSizes,
bool noMinMaxBounds);
/// Options to dictate how loops should be pipelined.
struct PipeliningOption {
/// Lambda returning all the operation in the forOp, with their stage, in the
/// order picked for the pipelined loop.
using GetScheduleFnType = std::function<void(
scf::ForOp, std::vector<std::pair<Operation *, unsigned>> &)>;
GetScheduleFnType getScheduleFn = nullptr;
enum class PipelinerPart {
Prologue,
Kernel,
Epilogue,
};
/// Lambda called by the pipeliner to allow the user to annotate the IR while
/// it is generated.
/// The callback passes the operation created along with the part of the
/// pipeline and the iteration index. The iteration index is always 0 for the
/// kernel. For the prologue and epilogue, it corresponds to the iteration
/// peeled out of the loop in the range [0, maxStage[.
using AnnotationlFnType =
std::function<void(Operation *, PipelinerPart, unsigned)>;
AnnotationlFnType annotateFn = nullptr;
/// Control whether the epilogue should be peeled out of the loop or
/// operations should be predicated to skip the early stages in the last loop
/// iterations. If the epilogue is predicated; the user needs to provide a
/// lambda to generate the predicated version of operations.
bool peelEpilogue = true;
/// Control whether the transformation checks that the number of iterations is
/// greater or equal to the number of stages and skip the transformation if
/// this is not the case. If the loop is dynamic and this is set to true and
/// the loop bounds are not static the pipeliner will have to predicate
/// operations in the the prologue/epilogue.
bool supportDynamicLoops = false;
// Callback to predicate operations when the prologue or epilogue are not
// peeled. This takes the original operation, an i1 predicate value and the
// pattern rewriter. It is expected to replace the given operation with
// the predicated equivalent and return it, or return nullptr if the
// predication is impossible. In the latter case, pipelining will fail and
// may leave IR in a partially transformed state.
using PredicateOpFn =
std::function<Operation *(RewriterBase &, Operation *, Value)>;
PredicateOpFn predicateFn = nullptr;
// TODO: add option to decide if the prologue should be peeled.
};
/// Generate a pipelined version of the scf.for loop based on the schedule given
/// as option. This applies the mechanical transformation of changing the loop
/// and generating the prologue/epilogue for the pipelining and doesn't make any
/// decision regarding the schedule.
/// Based on the options the loop is split into several stages.
/// The transformation assumes that the scheduling given by user is valid.
/// For example if we break a loop into 3 stages named S0, S1, S2 we would
/// generate the following code with the number in parenthesis as the iteration
/// index:
///
/// S0(0) // Prologue
/// S0(1) S1(0) // Prologue
/// scf.for %I = %C0 to %N - 2 {
/// S0(I+2) S1(I+1) S2(I) // Pipelined kernel
/// }
/// S1(N) S2(N-1) // Epilogue
/// S2(N) // Epilogue
///
/// If `modifiedIR` is provided, it will be set to a value that indicates
/// whether pipelining modified the IR before failing, signaling to the caller
/// whether they can proceed with different transformations.
FailureOr<ForOp> pipelineForLoop(RewriterBase &rewriter, ForOp forOp,
const PipeliningOption &options,
bool *modifiedIR = nullptr);
/// Create zero-trip-check around a `while` op and return the new loop op in the
/// check. The while loop is rotated to avoid evaluating the condition twice
///
/// By default the check won't be created for do-while loop as it is not
/// required. `forceCreateCheck` can force the creation.
///
/// It turns:
///
/// scf.while (%arg0 = %init) : (i32) -> i64 {
/// %val = .., %arg0 : i64
/// %cond = arith.cmpi .., %arg0 : i32
/// scf.condition(%cond) %val : i64
/// } do {
/// ^bb0(%arg1: i64):
/// %next = .., %arg1 : i32
/// scf.yield %next : i32
/// }
///
/// into:
///
/// %pre_val = .., %init : i64
/// %pre_cond = arith.cmpi .., %init : i32
/// scf.if %pre_cond -> i64 {
/// %res = scf.while (%arg1 = %va0) : (i64) -> i64 {
/// %next = .., %arg1 : i32
/// %val = .., %next : i64
/// %cond = arith.cmpi .., %next : i32
/// scf.condition(%cond) %val : i64
/// } do {
/// ^bb0(%arg2: i64):
/// %scf.yield %arg2 : i32
/// }
/// scf.yield %res : i64
/// } else {
/// scf.yield %pre_val : i64
/// }
FailureOr<WhileOp> wrapWhileLoopInZeroTripCheck(WhileOp whileOp,
RewriterBase &rewriter,
bool forceCreateCheck = false);
/// Try to uplift `scf.while` op to `scf.for`.
/// Uplifitng expects a specific ops pattern:
/// * `before` block consisting of single arith.cmp op
/// * `after` block containing arith.addi
FailureOr<ForOp> upliftWhileToForLoop(RewriterBase &rewriter, WhileOp loop);
} // namespace scf
} // namespace mlir
#endif // MLIR_DIALECT_SCF_TRANSFORMS_TRANSFORMS_H_