blob: 55fb8bdca79147d42d68833fe1c8e741c66cc667 [file] [log] [blame]
//===- Utils.h - Utilities to support the Linalg dialect --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_DIALECT_LINALG_UTILS_H_
#define MLIR_DIALECT_LINALG_UTILS_H_
#include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include <utility>
namespace mlir {
class AffineExpr;
class AffineForOp;
class AffineMap;
class PatternRewriter;
namespace linalg {
class LinalgDependenceGraph;
//===----------------------------------------------------------------------===//
// General utilities
//===----------------------------------------------------------------------===//
/// Check if `permutation` is a permutation of the range
/// `[0, permutation.size())`.
bool isPermutation(ArrayRef<int64_t> permutation);
/// Apply the permutation defined by `permutation` to `inVec`.
/// The element at position `permutation[i]` in `inVec` is moved to position
/// `i` in the result, i.e. `inVec[i] = oldInVec[permutation[i]]`.
/// E.g.: for an input vector `inVec = ['a', 'b', 'c']` and a permutation
/// vector `permutation = [2, 0, 1]`, this function leaves
/// `inVec = ['c', 'a', 'b']`.
template <typename T, unsigned N>
void applyPermutationToVector(SmallVector<T, N> &inVec,
                              ArrayRef<int64_t> permutation) {
  SmallVector<T, N> auxVec(inVec.size());
  for (const auto &en : enumerate(permutation))
    auxVec[en.index()] = inVec[en.value()];
  // Move-assign to avoid copying all permuted elements a second time.
  inVec = std::move(auxVec);
}
/// Helper function that creates a memref::DimOp or tensor::DimOp depending on
/// the type of `source`.
Value createOrFoldDimOp(OpBuilder &b, Location loc, Value source, int64_t dim);
/// Given an operation, retrieves the value of each dynamic dimension through
/// constructing the necessary DimOp operations.
SmallVector<Value, 4> getDynOperands(Location loc, Value val, OpBuilder &b);
/// Computes an upper bound for the result `value` of an index computation.
/// Translates AffineMinOps and AffineApplyOps along the use-def chains of the
/// index computation to affine constraints and projects out intermediate
/// values. The method sets `boundMap` to an affine map that given
/// `boundOperands` evaluates to an upper bound for the index computation.
///
/// Example:
/// ```
/// %dim0 = dim %tensor, %c0
/// %dim1 = dim %tensor, %c1
/// %0 = affine.min affine.map<(d0) -> (40, d0)> (%dim0)
/// %1 = affine.apply affine.map<(d0, d1) -> (d0 + d1)> (%0, %dim1)
/// ```
/// getUpperBoundForIndex(%1, boundMap, boundOperands)
/// sets the output parameters to:
/// - boundMap = affine.map<(d0) -> (d0 + 40)>
/// - boundOperands = [%dim1]
void getUpperBoundForIndex(Value value, AffineMap &boundMap,
SmallVectorImpl<Value> &boundOperands);
/// Returns a constant upper bound for the result `value` of an index
/// computation. Calls `getUpperBoundForIndex` and returns a constant upper
/// bound if the result of `boundMap` is a constant expression and failure
/// otherwise.
///
/// Example:
/// ```
/// %0 = affine.min affine.map<(d0) -> (40, d0)> (%d0)
/// %1 = affine.apply affine.map<(d0) -> (d0 + 2)> (%0)
/// ```
/// getConstantUpperBoundForIndex(%1) returns 42
/// (boundsMap = affine.map<() -> (42)>)
FailureOr<int64_t> getConstantUpperBoundForIndex(Value value);
/// Create an ExtractSliceOp and, if `source` is defined by an ExtractSliceOp,
/// fold it by adding the offsets.
///
/// Example:
/// ```
/// %0 = tensor.extract_slice %arg0[3, 4][3, 32][1, 1] : tensor<64x64xf32> to
/// tensor<3x32xf32>
/// %1 = tensor.extract_slice %0[0, 5][3, 4][1, 1] : tensor<3x32xf32> to
/// tensor<3x4xf32>
/// ```
/// folds into:
/// ```
/// %1 = tensor.extract_slice %arg0[3, 9][3, 4][1, 1] : tensor<64x64xf32> to
/// tensor<3x4xf32>
/// ```
tensor::ExtractSliceOp makeComposedExtractSliceOp(
OpBuilder &b, Location loc, Value source, ArrayRef<OpFoldResult> offsets,
ArrayRef<OpFoldResult> sizes, ArrayRef<OpFoldResult> strides);
/// Create a PadTensorOp that pads `source` to the size of the statically sized
/// `type` whose static sizes are assumed to be greater than the dynamic
/// `source` size. The padding introduces trailing `pad` values until the target
/// size is met. If `source` is defined by one or more LinalgOps that have been
/// padded with the same value and sizes, return their padded result instead of
/// creating a PadTensorOp.
///
/// Example:
/// ```
/// %0 = tensor.extract_slice %arg0 [%iv0, %iv1] [%sz0, %sz1]
/// %1 = linalg.pad_tensor %0 low[0, 0] high[...] { linalg.yield %cst }
/// %2 = linalg.matmul ins(...) outs(%1)
/// %3 = tensor.extract_slice %2 [0, 0] [%sz0, %sz1]
/// ```
/// makeComposedPadHighOp(source=%3, pad=%cst) returns %2
/// makeComposedPadHighOp(source=%3, pad=%other_cst) returns %4
/// ```
/// %4 = linalg.pad_tensor %3 low[0, 0] high[...] { linalg.yield %other_cst }
/// ```
Value makeComposedPadHighOp(OpBuilder &b, Location loc, RankedTensorType type,
Value source, Value pad, bool nofold);
//===----------------------------------------------------------------------===//
// Fusion / Tiling utilities
//===----------------------------------------------------------------------===//
/// The type of loops to be generated during tiling.
enum class LinalgTilingLoopType {
  /// Generate sequential scf.for loops.
  Loops = 0,
  /// Generate affine.for loops.
  AffineLoops = 1,
  /// Generate scf.parallel loops.
  ParallelLoops = 2,
  /// Generate a single linalg.tiled_loop op wrapping the tiled body.
  TiledLoops = 3,
};
/// Checks whether the specific `producer` is the last write to exactly the
/// whole `consumedView`. This checks structural dominance, that the dependence
/// is a RAW without any interleaved write to any piece of `consumedView`.
bool isProducerLastWriteOfView(const LinalgDependenceGraph &graph,
LinalgOp consumer, Value consumedView,
LinalgOp producer);
/// Checks whether fusing the specific `producer` of the `consumedView` is
/// feasible. This checks `producer` is the last write of `consumedView` and
/// that no interleaved dependence would be violated (RAW, WAR or WAW).
bool isFusableInto(const LinalgDependenceGraph &graph, LinalgOp consumer,
Value consumedView, LinalgOp producer);
/// Compute tile offsets, given a list of loop `ivs` and `tileSizes`. In case a
/// tile size is zero (i.e., no tiling), the corresponding offset is also zero.
SmallVector<Value> computeTileOffsets(OpBuilder &b, Location loc,
ValueRange ivs, ValueRange tileSizes);
/// Compute tile sizes, given a list of loop `ivs`, `tileSizes` and dimension
/// sizes (`sizeBounds`). In case a tile size is zero (i.e., no tiling), the
/// corresponding result size is the corresponding value from `sizeBounds`.
/// Note: The returned tile sizes are closed intervals.
SmallVector<Value> computeTileSizes(OpBuilder &b, Location loc, ValueRange ivs,
ValueRange tileSizes,
ArrayRef<Value> sizeBounds);
/// Creates an extract_slice/subview op for a single `valueToTile` with
/// `builder`. This new operation extracts a tile of `valueToTile`, starting
/// at offsets `lbs` and with sizes `subShapeSizes`.
Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
ValueRange tileSizes, AffineMap map, ValueRange lbs,
ValueRange ubs, ValueRange subShapeSizes);
/// Creates extract_slice/subview ops for all `valuesToTile` of the given
/// `linalgOp` with `builder`, assuming `linalgOp` is being fused into a loop
/// nest for tiling with the given induction variables `ivs` and tile sizes
/// `tileSizes`. `sizeBounds` are the iteration space bounds for *all* the
/// implicit loops in `linalgOp`.
///
/// Note that a constant zero in `tileSizes` means no tiling at that implicit
/// loop. The number of non-zero values in `tileSizes` should be equal to the
/// number of values in `ivs`.
SmallVector<Value, 4> makeTiledShapes(OpBuilder &builder, Location loc,
LinalgOp linalgOp,
ArrayRef<Value> valuesToTile,
ValueRange ivs, ValueRange tileSizes,
ArrayRef<Value> sizeBounds);
/// Add the tile loop induction variables `ivs` to the IndexOp results found in
/// the body of the `tiledOp` to account for the tile offset.
void addTileLoopIvsToIndexOpResults(OpBuilder &b, LinalgOp tiledOp,
ArrayRef<Value> ivs);
using FusableOpDependencesTy = llvm::MapVector<
Operation *,
SmallVector<LinalgDependenceGraph::LinalgDependenceGraphElem, 1>>;
FusableOpDependencesTy
findAllFusableDependences(ArrayRef<LinalgOp> ops,
const LinalgDependenceGraph &dependenceGraph);
/// A struct containing the Linalg producer before and after fusion.
/// When operating on tensors, `fusedProducer` may feed into a `tensor.cast` op
/// before the consumer Linalg op, until enough canonicalizations have applied.
struct FusionInfo {
  /// The producer op as it existed before the fusion transformation.
  LinalgOp originalProducer;
  /// The producer op resulting from the fusion transformation.
  LinalgOp fusedProducer;
};
/// Fuses producer into consumer if the producer is structurally feasible and
/// the fusion would not violate dependencies.
/// Implements the fusion part of the "tileAndFuse on buffers" transformation
/// and thus requires the `consumerOpOperand` to be a `subview` op (generally
/// obtained by applying the tiling transformation).
FailureOr<FusionInfo> fuseProducerOfBuffer(OpBuilder &b,
OpOperand &consumerOpOperand,
const LinalgDependenceGraph &graph);
/// Tensor counterpart of `fuseProducerOfBuffer`.
/// This implements the fusion part of the "tileAndFuse on tensors"
/// transformation and thus requires the `consumerOpOperand` to be a
/// `extract_slice` op (generally obtained by applying the tiling
/// transformation).
FailureOr<FusionInfo> fuseProducerOfTensor(OpBuilder &b,
OpOperand &consumerOpOperand);
/// Tensor counterpart of `fuseProducerOfBuffer`.
/// This implements the fusion part of the "tileAndFuse on tensors"
/// transformation and thus requires the `consumerOpOperand` to be a
/// `extract_slice` op (generally obtained by applying the tiling
/// transformation). Assumes `producerOfTensor` is a Linalg op that produces
/// `consumerOpOperand`.
FailureOr<FusionInfo> fuseProducerOfTensor(OpBuilder &b,
OpResult producerOpResult,
OpOperand &consumerOpOperand);
//===----------------------------------------------------------------------===//
// Fusion on tensor utilities
//===----------------------------------------------------------------------===//
/// A struct to manage the tile loop nest specific information.
class TileLoopNest {
public:
  TileLoopNest(LinalgOp rootOp) : rootOp(rootOp) {}

  /// Tile the root operation using the given `tileSizes` and
  /// `tileInterchange`.
  LogicalResult tileRootOp(OpBuilder &b, ArrayRef<int64_t> tileSizes,
                           ArrayRef<int64_t> tileInterchange);

  /// Fuse the producer of `consumerOpOperand` into the tile loop nest. Returns
  /// the fused producer or fails if fusion is not possible.
  FailureOr<LinalgOp> fuseProducer(OpBuilder &b, OpOperand *consumerOpOperand);

  /// Returns the replacement results for the original untiled root operation.
  ValueRange getRootOpReplacementResults();

  /// Returns the tiled root operation.
  LinalgOp getRootOp() { return rootOp; }

  /// Returns the tiled root operation and the fused producers.
  SmallVector<LinalgOp> getAllTiledAndFusedOps();

  /// Returns the loop ops generated from tiling.
  ArrayRef<scf::ForOp> getLoopOps() { return tileLoopOps; }

private:
  /// Returns true if the tile loop nest has no tile loops.
  bool isEmpty();

  /// Returns true if the tile loop nest invariants are satisfied:
  /// - The `rootOp` has been tiled at least once.
  /// - The number of tile loop operations and dimensions match.
  /// - The innermost tile loop is the parent of `tiledOp`.
  /// - The tile loops are directly nested.
  // TODO: relax to support additional control flow, e.g., IfOp.
  bool isValid();

  /// Searches the block arguments tied to a block argument `bbArg` of the
  /// innermost tile loop. Returns the block arguments from outermost to
  /// innermost or an empty vector if none are found.
  SmallVector<BlockArgument> getTiedBBArgs(BlockArgument bbArg);

  /// Returns the iteration argument of the outermost tile loop mapped to a
  /// block argument `bbArg` of the innermost tile loop.
  OpOperand *getTiedIterArg(BlockArgument bbArg);

  /// Returns true if `bbArg` has uses other than `sliceOp` and its
  /// dependencies. Only if there are no other uses, the producer output
  /// iteration argument may be reused to pass the producer result after
  /// fusion.
  bool hasOtherUses(BlockArgument bbArg, tensor::ExtractSliceOp sliceOp);

  /// The tiled root operation of the loop nest.
  LinalgOp rootOp;
  /// The loop ops generated from tiling.
  SmallVector<scf::ForOp> tileLoopOps;
  /// Maps each tiled root or fused operation to loop dimensions — presumably
  /// the tile loop dimensions it has been tiled in; confirm in implementation.
  DenseMap<Operation *, SmallVector<int64_t>> tiledRootAndFusedOpsLoops;
};
/// Tiles `consumerOp` and fuses its dependencies if possible. Uses the
/// `tileSizes` and `tileInterchange` parameters to control the tiling.
FailureOr<TileLoopNest>
tileConsumerAndFuseProducers(OpBuilder &b, LinalgOp consumerOp,
ArrayRef<int64_t> tileSizes,
ArrayRef<int64_t> tileInterchange);
//===----------------------------------------------------------------------===//
// Distribution utilities
//===----------------------------------------------------------------------===//
/// Scheme used to distribute loops to processors.
enum class DistributionMethod {
  /// Cyclic distribution where no assumption is made about the dynamic
  /// relationship between number of processors and number of iterations of
  /// the distributed loop. Distributes the following loop
  ///
  /// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
  ///
  /// to
  ///
  /// scf.parallel (%iv) = (%lb + %procId * %step) to (%ub)
  ///     step (%step * %nprocs)
  Cyclic = 0,

  /// Cyclic distribution where the number of processors can be assumed to be
  /// more than or equal to the number of iterations of the distributed loop.
  /// In such cases, a simple in-bounds check is enough (instead of
  /// materializing a loop). Distributes the following loop
  ///
  /// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
  ///
  /// to
  ///
  /// %iv = %lb + %procId * %step
  /// %cond = arith.cmpi "slt", %iv, %ub
  /// scf.if %cond {
  ///   ...
  /// }
  CyclicNumProcsGeNumIters = 1,

  /// Cyclic distribution where the number of processors can be assumed to be
  /// equal to the number of iterations of the distributed loop. In such cases,
  /// no bounds check is needed. Distributes the following loop
  ///
  /// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
  ///
  /// to
  ///
  /// %iv = %lb + %procId * %step
  CyclicNumProcsEqNumIters = 2
};
/// Callback function type used to get processor ID, and number of processors
/// used for distribution for all parallel loops generated.
struct ProcInfo {
  /// Value holding the ID of the processor executing the loop.
  Value procId;
  /// Value holding the number of processors used to execute the loop.
  Value nprocs;
};
/// Callback returning one `{procId, nprocs}` pair per parallel loop range in
/// `parallelLoopRanges`.
using ProcInfoCallBackFn = std::function<SmallVector<ProcInfo, 2>(
    OpBuilder &b, Location loc, ArrayRef<Range> parallelLoopRanges)>;
/// Callback returning the processor information for a single loop.
using OneDimProcInfoCallBackFn =
    std::function<ProcInfo(OpBuilder &b, Location loc)>;
/// Options that allow distribution of loops generated in Linalg transforms to
/// processors while generating the loops.
struct LinalgLoopDistributionOptions {
  /// Callback function that returns the Values for processor ID (`procId`),
  /// and number of processors (`nprocs`) used to execute the parallel loops.
  /// The number of `{procId, nprocs}` pairs returned must be equal to the
  /// number of `parallelLoopRanges` passed into the callback, which in turn
  /// is the same as the number of parallel loops for which the
  /// `distributionMethod` is specified below.
  ProcInfoCallBackFn procInfo;
  /// Specification of how to distribute the `scf.parallel` loops that are
  /// generated. As the `scf.parallel` loop is generated, the elements of this
  /// vector are used (from left to right) and the specified distribution is
  /// applied. If the vector is less than the number of `scf.parallel` loops
  /// generated, then no distribution is applied.
  SmallVector<DistributionMethod, 0> distributionMethod = {};
  /// The map keyed by the distribution type that contains callback functions
  /// that return the Values for processor ID (`procId`), and number of
  /// processors (`nprocs`) used to execute the parallel loops.
  DenseMap<StringRef, OneDimProcInfoCallBackFn> procInfoMap;
};
/// Update the `lb`, `ub` and `step` to get per processor `lb`, `ub` and `step`.
void updateBoundsForCyclicDistribution(OpBuilder &builder, Location loc,
Value procId, Value nprocs, Value &lb,
Value &ub, Value &step);
//===----------------------------------------------------------------------===//
// Generic op region utilities
//===----------------------------------------------------------------------===//
/// A struct containing common matchers over linalg op's region.
struct RegionMatcher {
  /// Kinds of scalar binary operations recognized by the matcher.
  enum class BinaryOpKind {
    /// Integer addition.
    IAdd,
  };

  /// Matches the given linalg op if its body is performing binary operation on
  /// int or float scalar values and returns the binary op kind.
  ///
  /// The linalg op's region is expected to be
  /// ```
  /// {
  ///   ^bb(%a: <scalar-type>, %b: <scalar-type>):
  ///     %0 = <binary-op> %a, %b: <scalar-type>
  ///     linalg.yield %0: <scalar-type>
  /// }
  /// ```
  /// Returns None when the region does not match this pattern (the return
  /// type is Optional).
  static Optional<BinaryOpKind> matchAsScalarBinaryOp(GenericOp op);
};
//===----------------------------------------------------------------------===//
// Loop nest utilities
//===----------------------------------------------------------------------===//
/// Utility class used to generate nested loops with ranges described by
/// `loopRanges` and loop type described by the `iteratorTypes`. `bodyBuilderFn`
/// is used to generate the body of the innermost loop. It is passed a range
/// of loop induction variables and a range of operand values to use.
template <typename LoopTy>
struct GenerateLoopNest {
  /// Emits the loop nest for `linalgOp` and invokes `bodyBuilderFn` inside the
  /// innermost loop. `distributionOptions`, when provided, controls how the
  /// generated parallel loops are distributed to processors;
  /// `distributionTypes` selects per-loop callbacks by key from
  /// `LinalgLoopDistributionOptions::procInfoMap`.
  static void doit(OpBuilder &b, Location loc, ArrayRef<Range> loopRanges,
                   LinalgOp linalgOp, ArrayRef<Attribute> iteratorTypes,
                   function_ref<scf::ValueVector(OpBuilder &, Location,
                                                 ValueRange, ValueRange)>
                       bodyBuilderFn,
                   Optional<LinalgLoopDistributionOptions>
                       distributionOptions = None,
                   ArrayRef<StringRef> distributionTypes = {});
};
} // namespace linalg
} // namespace mlir
#endif // MLIR_DIALECT_LINALG_UTILS_H_