| //===- Transforms.h - SCF dialect transformation utilities ------*- C++ -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This header file defines transformations on SCF operations. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef MLIR_DIALECT_SCF_TRANSFORMS_H_ |
| #define MLIR_DIALECT_SCF_TRANSFORMS_H_ |
| |
| #include "mlir/Support/LLVM.h" |
| #include "llvm/ADT/ArrayRef.h" |
| |
| namespace mlir { |
| |
| class AffineMap; |
| class ConversionTarget; |
| struct LogicalResult; |
| class MLIRContext; |
| class Region; |
| class RewriterBase; |
| class TypeConverter; |
| class RewritePatternSet; |
| using OwningRewritePatternList = RewritePatternSet; |
| class Operation; |
| class Value; |
| class ValueRange; |
| |
| namespace scf { |
| |
| class IfOp; |
| class ForOp; |
| class ParallelOp; |
| class ForOp; |
| |
| /// Match "for loop"-like operations: If the first parameter is an iteration |
| /// variable, return lower/upper bounds via the second/third parameter and the |
| /// step size via the last parameter. The function should return `success` in |
| /// that case. If the first parameter is not an iteration variable, return |
| /// `failure`. |
| using LoopMatcherFn = |
| function_ref<LogicalResult(Value, Value &, Value &, Value &)>; |
| |
| /// Try to canonicalize an min/max operations in the context of for `loops` with |
| /// a known range. |
| /// |
| /// `map` is the body of the min/max operation and `operands` are the SSA values |
| /// that the dimensions and symbols are bound to; dimensions are listed first. |
| /// If `isMin`, the operation is a min operation; otherwise, a max operation. |
| /// `loopMatcher` is used to retrieve loop bounds and the step size for a given |
| /// iteration variable. |
| /// |
| /// Note: `loopMatcher` allows this function to be used with any "for loop"-like |
| /// operation (scf.for, scf.parallel and even ops defined in other dialects). |
| LogicalResult canonicalizeMinMaxOpInLoop(RewriterBase &rewriter, Operation *op, |
| AffineMap map, ValueRange operands, |
| bool isMin, LoopMatcherFn loopMatcher); |
| |
| /// Fuses all adjacent scf.parallel operations with identical bounds and step |
| /// into one scf.parallel operations. Uses a naive aliasing and dependency |
| /// analysis. |
| void naivelyFuseParallelOps(Region ®ion); |
| |
| /// Rewrite a for loop with bounds/step that potentially do not divide evenly |
| /// into a for loop where the step divides the iteration space evenly, followed |
| /// by another scf.for for the last (partial) iteration (if any; returned via |
| /// `partialIteration`). This transformation is called "loop peeling". |
| /// |
| /// This transformation is beneficial for a wide range of transformations such |
| /// as vectorization or loop tiling: It enables additional canonicalizations |
| /// inside the peeled loop body such as rewriting masked loads into unmaked |
| /// loads. |
| /// |
| /// E.g., assuming a lower bound of 0 (for illustration purposes): |
| /// ``` |
| /// scf.for %iv = %c0 to %ub step %c4 { |
| /// (loop body) |
| /// } |
| /// ``` |
| /// is rewritten into the following pseudo IR: |
| /// ``` |
| /// %newUb = %ub - (%ub mod %c4) |
| /// scf.for %iv = %c0 to %newUb step %c4 { |
| /// (loop body) |
| /// } |
| /// scf.for %iv2 = %newUb to %ub { |
| /// (loop body) |
| /// } |
| /// ``` |
| /// |
| /// After loop peeling, this function tries to simplify/canonicalize affine.min |
| /// and affine.max ops in the body of the peeled loop and in the body of the |
| /// partial iteration loop, taking advantage of the fact that the peeled loop |
| /// has only "full" iterations. This canonicalization is expected to enable |
| /// further canonicalization opportunities through other patterns. |
| /// |
| /// The return value indicates whether the loop was rewritten or not. Loops are |
| /// not rewritten if: |
| /// * Loop step size is 1 or |
| /// * Loop bounds and step size are static, and step already divides the |
| /// iteration space evenly. |
| /// |
| /// Note: This function rewrites the given scf.for loop in-place and creates a |
| /// new scf.for operation for the last iteration. It replaces all uses of the |
| /// unpeeled loop with the results of the newly generated scf.for. |
| LogicalResult peelAndCanonicalizeForLoop(RewriterBase &rewriter, ForOp forOp, |
| scf::ForOp &partialIteration); |
| |
| /// Try to simplify a min/max operation `op` after loop peeling. This function |
| /// can simplify min/max operations such as (ub is the previous upper bound of |
| /// the unpeeled loop): |
| /// ``` |
| /// #map = affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)> |
| /// %r = affine.min #affine.min #map(%iv)[%step, %ub] |
| /// ``` |
| /// and rewrites them into (in the case the peeled loop): |
| /// ``` |
| /// %r = %step |
| /// ``` |
| /// min/max operations inside the partial iteration are rewritten in a similar |
| /// way. |
| LogicalResult rewritePeeledMinMaxOp(RewriterBase &rewriter, Operation *op, |
| AffineMap map, ValueRange operands, |
| bool isMin, Value iv, Value ub, Value step, |
| bool insideLoop); |
| |
| /// Tile a parallel loop of the form |
| /// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) |
| /// step (%arg4, %arg5) |
| /// |
| /// into |
| /// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) |
| /// step (%arg4*tileSize[0], |
| /// %arg5*tileSize[1]) |
| /// scf.parallel (%j0, %j1) = (0, 0) to (min(tileSize[0], %arg2-%j0) |
| /// min(tileSize[1], %arg3-%j1)) |
| /// step (%arg4, %arg5) |
| /// The old loop is replaced with the new one. |
| /// |
| /// The function returns the resulting ParallelOps, i.e. {outer_loop_op, |
| /// inner_loop_op}. |
| std::pair<ParallelOp, ParallelOp> |
| tileParallelLoop(ParallelOp op, llvm::ArrayRef<int64_t> tileSizes, |
| bool noMinMaxBounds); |
| |
| /// Populates patterns for SCF structural type conversions and sets up the |
| /// provided ConversionTarget with the appropriate legality configuration for |
| /// the ops to get converted properly. |
| /// |
| /// A "structural" type conversion is one where the underlying ops are |
| /// completely agnostic to the actual types involved and simply need to update |
| /// their types. An example of this is scf.if -- the scf.if op and the |
| /// corresponding scf.yield ops need to update their types accordingly to the |
| /// TypeConverter, but otherwise don't care what type conversions are happening. |
| void populateSCFStructuralTypeConversionsAndLegality( |
| TypeConverter &typeConverter, RewritePatternSet &patterns, |
| ConversionTarget &target); |
| |
| /// Options to dictate how loops should be pipelined. |
| struct PipeliningOption { |
| /// Lambda returning all the operation in the forOp, with their stage, in the |
| /// order picked for the pipelined loop. |
| using GetScheduleFnType = std::function<void( |
| scf::ForOp, std::vector<std::pair<Operation *, unsigned>> &)>; |
| GetScheduleFnType getScheduleFn; |
| // TODO: add option to decide if the prologue/epilogue should be peeled. |
| }; |
| |
| /// Populate patterns for SCF software pipelining transformation. |
| /// This transformation generates the pipelined loop and doesn't do any |
| /// assumptions on the schedule dictated by the option structure. |
| /// Software pipelining is usually done in two part. The first part of |
| /// pipelining is to schedule the loop and assign a stage and cycle to each |
| /// operations. This is highly dependent on the target and is implemented as an |
| /// heuristic based on operation latencies, and other hardware characteristics. |
| /// The second part is to take the schedule and generate the pipelined loop as |
| /// well as the prologue and epilogue. It is independent of the target. |
| /// This pattern only implement the second part. |
| /// For example if we break a loop into 3 stages named S0, S1, S2 we would |
| /// generate the following code with the number in parenthesis the iteration |
| /// index: |
| /// S0(0) // Prologue |
| /// S0(1) S1(0) // Prologue |
| /// scf.for %I = %C0 to %N - 2 { |
| /// S0(I+2) S1(I+1) S2(I) // Pipelined kernel |
| /// } |
| /// S1(N) S2(N-1) // Epilogue |
| /// S2(N) // Epilogue |
| void populateSCFLoopPipeliningPatterns(RewritePatternSet &patterns, |
| const PipeliningOption &options); |
| |
| /// Populate patterns for canonicalizing operations inside SCF loop bodies. |
| /// At the moment, only affine.min/max computations with iteration variables, |
| /// loop bounds and loop steps are canonicalized. |
| void populateSCFForLoopCanonicalizationPatterns(RewritePatternSet &patterns); |
| |
| } // namespace scf |
| } // namespace mlir |
| |
| #endif // MLIR_DIALECT_SCF_TRANSFORMS_H_ |