mlir/include/mlir/Dialect/Linalg/Utils/Utils.h - llvm-project - Git at Google

 //===- Utils.h - Utilities to support the Linalg dialect --------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 #ifndef MLIR_DIALECT_LINALG_UTILS_H_
 #define MLIR_DIALECT_LINALG_UTILS_H_

 #include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SetVector.h"

 namespace mlir {
 class AffineExpr;
 class AffineForOp;
 class AffineMap;
 class PatternRewriter;

 namespace linalg {
 class LinalgDependenceGraph;

 //===----------------------------------------------------------------------===//
 // General utilities
 //===----------------------------------------------------------------------===//

 /// Check if `permutation` is a permutation of the range
 /// `[0, permutation.size())`.
 bool isPermutation(ArrayRef<int64_t> permutation);

 /// Apply the permutation defined by `permutation` to `inVec`.
 /// Element `i` in `inVec` is mapped to location `j = permutation[i]`.
 /// E.g.: for an input vector `inVec = ['a', 'b', 'c']` and a permutation vector
 /// `permutation = [2, 0, 1]`, this function leaves `inVec = ['c', 'a', 'b']`.
 template <typename T, unsigned N>
 void applyPermutationToVector(SmallVector<T, N> &inVec,
                               ArrayRef<int64_t> permutation) {
   SmallVector<T, N> auxVec(inVec.size());
   for (auto en : enumerate(permutation))
     auxVec[en.index()] = inVec[en.value()];
   inVec = auxVec;
 }

 /// Helper function that creates a memref::DimOp or tensor::DimOp depending on
 /// the type of `source`.
 Value createOrFoldDimOp(OpBuilder &b, Location loc, Value source, int64_t dim);

 /// Given an operation, retrieves the value of each dynamic dimension through
 /// constructing the necessary DimOp operators.
 SmallVector<Value, 4> getDynOperands(Location loc, Value val, OpBuilder &b);

 /// Computes an upper bound for the result `value` of an index computation.
 /// Translates AffineMinOps and AffineApplyOps along the use-def chains of the
 /// index computation to affine constraints and projects out intermediate
 /// values. The method sets `boundMap` to an affine map that given
 /// `boundOperands` evaluates to an upper bound for the index computation.
 ///
 /// Example:
 /// ```
 /// %dim0 = dim %tensor, %c0
 /// %dim1 = dim %tensor, %c1
 /// %0 = affine.min affine.map<(d0) -> (40, d0)> (%dim0)
 /// %1 = affine.apply affine.map<(d0, d1) -> (d0 + d1)> (%0, %dim1)
 /// ```
 /// getUpperBoundForIndex(%1, boundMap, boundOperands)
 /// set the output parameters to:
 /// - boundMap = affine.map<(d0) -> (d0 + 40)>
 /// - boundOperands = [%dim1]
 void getUpperBoundForIndex(Value value, AffineMap &boundMap,
                            SmallVectorImpl<Value> &boundOperands);

 /// Returns a constant upper bound for the result `value` of an index
 /// computation. Calls `getUpperBoundForIndex` and returns a constant upper
 /// bound if the result of `boundMap` is a constant expression and failure
 /// otherwise.
 ///
 /// Example:
 /// ```
 /// %0 = affine.min affine.map<(d0) -> (40, d0)> (%d0)
 /// %1 = affine.apply affine.map<(d0) -> (d0 + 2)> (%0)
 /// ```
 /// getConstantUpperBoundForIndex(%1) returns 42
 /// (boundsMap = affine.map<() -> (42)>)
 FailureOr<int64_t> getConstantUpperBoundForIndex(Value value);

 /// Create an ExtractSliceOp and, if `source` is defined by an ExtractSliceOp,
 /// fold it by adding the offsets.
 ///
 /// Example:
 /// ```
 /// %0 = tensor.extract_slice %arg0[3, 4][3, 32][1, 1] : tensor<64x64xf32> to
 ///                                                        tensor<3x32xf32>
 /// %1 = tensor.extract_slice %0[0, 5][3, 4][1, 1] : tensor<3x32xf32> to
 ///                                                    tensor<3x4xf32>
 /// ```
 /// folds into:
 /// ```
 /// %1 = tensor.extract_slice %arg0[3, 9][3, 4][1, 1] : tensor<64x64xf32> to
 ///                                                       tensor<3x4xf32>
 /// ```
 tensor::ExtractSliceOp makeComposedExtractSliceOp(
     OpBuilder &b, Location loc, Value source, ArrayRef<OpFoldResult> offsets,
     ArrayRef<OpFoldResult> sizes, ArrayRef<OpFoldResult> strides);

 /// Create a PadTensorOp that pads `source` to the size of the statically sized
 /// `type` whose static sizes are assumed to be greater than the dynamic
 /// `source` size. The padding introduces trailing `pad` values until the target
 /// size is met. If `source` is defined by one or more LinalgOps that have been
 /// padded with the same value and sizes, return their padded result instead of
 /// creating a PadTensorOp.
 ///
 /// Example:
 /// ```
 /// %0 = tensor.extract_slice %arg0 [%iv0, %iv1] [%sz0, %sz1]
 /// %1 = linalg.pad_tensor %0 low[0, 0] high[...] { linalg.yield %cst }
 /// %2 = linalg.matmul ins(...) outs(%1)
 /// %3 = tensor.extract_slice %2 [0, 0] [%sz0, %sz1]
 /// ```
 /// makeComposedPadHighOp(source=%3, pad=%cst) returns %2
 /// makeComposedPadHighOp(source=%3, pad=%other_cst) returns %4
 /// ```
 /// %4 = linalg.pad_tensor %3 low[0, 0] high[...] { linalg.yield %other_cst }
 /// ```
 Value makeComposedPadHighOp(OpBuilder &b, Location loc, RankedTensorType type,
                             Value source, Value pad, bool nofold);

 //===----------------------------------------------------------------------===//
 // Fusion / Tiling utilities
 //===----------------------------------------------------------------------===//

 /// The type of loops to be generated during tiling.
 enum class LinalgTilingLoopType {
   Loops = 0,
   AffineLoops = 1,
   ParallelLoops = 2,
   TiledLoops = 3,
 };

 /// Checks whether the specific `producer` is the last write to exactly the
 /// whole `consumedView`. This checks structural dominance, that the dependence
 /// is a RAW without any interleaved write to any piece of `consumedView`.
 bool isProducerLastWriteOfView(const LinalgDependenceGraph &graph,
                                LinalgOp consumer, Value consumedView,
                                LinalgOp producer);

 /// Checks whether fusing the specific `producer` of the `consumedView` is
 /// feasible. This checks `producer` is the last write of `consumedView` and
 /// that no interleaved dependence would be violated (RAW, WAR or WAW).
 bool isFusableInto(const LinalgDependenceGraph &graph, LinalgOp consumer,
                    Value consumedView, LinalgOp producer);

 /// Compute tile offsets, given a list of loop `ivs` and `tileSizes`. In case a
 /// tile size is zero (i.e., no tiling), the corresponding offset is also zero.
 SmallVector<Value> computeTileOffsets(OpBuilder &b, Location loc,
                                       ValueRange ivs, ValueRange tileSizes);

 /// Compute tile sizes, given a list of loop `ivs`, `tileSizes` and dimension
 /// sizes (`sizeBounds`). In case a tile size is zero (i.e., no tiling), the
 /// corresponding result size is the corresponding value from `sizeBounds`.
 /// Note: The returned tile sizes are closed intervals.
 SmallVector<Value> computeTileSizes(OpBuilder &b, Location loc, ValueRange ivs,
                                     ValueRange tileSizes,
                                     ArrayRef<Value> sizeBounds);

 /// Creates an extract_slice/subview op for a single `valueToTile` with
 /// `builder`. This new operation extracts a tile of `valueToTile`, starting
 /// at offsets `lbs` and with sizes `subShapeSizes`.
 Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
                      ValueRange tileSizes, AffineMap map, ValueRange lbs,
                      ValueRange ubs, ValueRange subShapeSizes);

 /// Creates extract_slice/subview ops for all `valuesToTile` of the given
 /// `linalgOp` with `builder`, assuming `linalgOp` is being fused into a loop
 /// nest for tiling with the given induction variables `ivs` and tile sizes
 /// `tileSizes`. `sizeBounds` are the iteration space bounds for *all* the
 /// implicit loops in `linalgOp`.
 ///
 /// Note that a constant zero in `tileSizes` means no tiling at that implicit
 /// loop. The number of non-zero values in `tileSizes` should be equal to the
 /// number of values in `ivs`.
 SmallVector<Value, 4> makeTiledShapes(OpBuilder &builder, Location loc,
                                       LinalgOp linalgOp,
                                       ArrayRef<Value> valuesToTile,
                                       ValueRange ivs, ValueRange tileSizes,
                                       ArrayRef<Value> sizeBounds);

 /// Add the tile loop induction variables `ivs` to the IndexOp results found in
 /// the body of the `tiledOp` to account for the tile offset.
 void addTileLoopIvsToIndexOpResults(OpBuilder &b, LinalgOp tiledOp,
                                     ArrayRef<Value> ivs);

 using FusableOpDependencesTy = llvm::MapVector<
     Operation *,
     SmallVector<LinalgDependenceGraph::LinalgDependenceGraphElem, 1>>;
 FusableOpDependencesTy
 findAllFusableDependences(ArrayRef<LinalgOp> ops,
                           const LinalgDependenceGraph &dependenceGraph);

 /// A struct containing the Linalg producer before and after fusion.
 /// When operating on tensors, `fusedProducer` may feed into a `tensor.cast` op
 /// before the consumer Linalg op, until enough canonicalizations have applied.
 struct FusionInfo {
   LinalgOp originalProducer;
   LinalgOp fusedProducer;
 };

 /// Fuses producer into consumer if the producer is structurally feasible and
 /// the fusion would not violate dependencies.
 /// Implements the fusion part of the "tileAndFuse on buffers" transformation
 /// and thus requires the `consumerOpOperand` to be a `subview` op (generally
 /// obtained by applying the tiling transformation).
 FailureOr<FusionInfo> fuseProducerOfBuffer(OpBuilder &b,
                                            OpOperand &consumerOpOperand,
                                            const LinalgDependenceGraph &graph);
 /// Tensor counterpart of `fuseProducerOfBuffer`.
 /// This implements the fusion part of the "tileAndFuse on tensors"
 /// transformation and thus requires the `consumerOpOperand` to be a
 /// `extract_slice` op (generally obtained by applying the tiling
 /// transformation).
 FailureOr<FusionInfo> fuseProducerOfTensor(OpBuilder &b,
                                            OpOperand &consumerOpOperand);
 /// Tensor counterpart of `fuseProducerOfBuffer`.
 /// This implements the fusion part of the "tileAndFuse on tensors"
 /// transformation and thus requires the `consumerOpOperand` to be a
 /// `extract_slice` op (generally obtained by applying the tiling
 /// transformation). Assumes `producerOfTensor` is a Linalg op that produces
 /// `consumerOpOperand`.
 FailureOr<FusionInfo> fuseProducerOfTensor(OpBuilder &b,
                                            OpResult producerOpResult,
                                            OpOperand &consumerOpOperand);

 //===----------------------------------------------------------------------===//
 // Fusion on tensor utilities
 //===----------------------------------------------------------------------===//

 /// A struct to manage the tile loop nest specific information.
 class TileLoopNest {
 public:
   TileLoopNest(LinalgOp rootOp) : rootOp(rootOp) {}

   /// Tile the root operation using the given `tileSizes` and `tileInterchange`.
   LogicalResult tileRootOp(OpBuilder &b, ArrayRef<int64_t> tileSizes,
                            ArrayRef<int64_t> tileInterchange);

   /// Fuse the producer of `consumerOpOperand` into the tile loop nest. Returns
   /// the fused producer or fails if fusion is not possible.
   FailureOr<LinalgOp> fuseProducer(OpBuilder &b, OpOperand *consumerOpOperand);

   /// Returns the replacement results for the original untiled root operation.
   ValueRange getRootOpReplacementResults();

   /// Returns the tiled root operation.
   LinalgOp getRootOp() { return rootOp; }

   /// Returns the tiled root operation and the fused producers.
   SmallVector<LinalgOp> getAllTiledAndFusedOps();

   /// Returns the loop ops generated from tiling.
   ArrayRef<scf::ForOp> getLoopOps() { return tileLoopOps; }

 private:
   /// Returns true if the tile loop nest has no tile loops.
   bool isEmpty();

   /// Returns true if the tile loop nest invariants are satisfied:
   /// - The `rootOp` has been tiled at least once.
   /// - The number of tile loop operations and dimensions match.
   /// - The innermost tile loop is the parent of `tiledOp`.
   /// - The tile loops are directly nested.
   // TODO: relax to support additional control flow, e.g., IfOp.
   bool isValid();

   /// Searches the block arguments tied to a block argument `bbArg` of the
   /// innermost tile loop. Returns the block argument from outermost to
   /// innermost or an empty vector if none are found.
   SmallVector<BlockArgument> getTiedBBArgs(BlockArgument bbArg);

   /// Returns the iteration argument of the outermost tile loop mapped to a
   /// block argument `bbArg` of the innermost tile loop.
   OpOperand *getTiedIterArg(BlockArgument bbArg);

   /// Returns true if `bbArg` has other used than `sliceOp` and its
   /// dependencies. Only if there are no other uses, the producer output
   /// iteration argument may reused to pass the producer result after fusion.
   bool hasOtherUses(BlockArgument bbArg, tensor::ExtractSliceOp sliceOp);

   LinalgOp rootOp;
   SmallVector<scf::ForOp> tileLoopOps;
   DenseMap<Operation *, SmallVector<int64_t>> tiledRootAndFusedOpsLoops;
 };

 /// Tiles `consumerOp` and fuses its dependencies if possible. Uses the
 /// `tileSizes` and `tileInterchange` parameters to control the tiling.
 FailureOr<TileLoopNest>
 tileConsumerAndFuseProducers(OpBuilder &b, LinalgOp consumerOp,
                              ArrayRef<int64_t> tileSizes,
                              ArrayRef<int64_t> tileInterchange);

 //===----------------------------------------------------------------------===//
 // Distribution utilities
 //===----------------------------------------------------------------------===//

 /// Scheme used to distribute loops to processors.
 enum class DistributionMethod {
   /// Cyclic distribution where no assumption is made about the dynamic
   /// relationship between number of processors and number of iterations of the
   /// distributed loop. Distributes the following loop
   ///
   /// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
   ///
   /// to
   ///
   /// scf.parallel(%iv)= (%lb + %procId * %step) to (%ub) step (%step * %nprocs)
   Cyclic = 0,

   /// Cyclic distribution where the number of processors can be assumed to be
   /// more than or equal to the number of iterations of the distributed loop. In
   /// such cases, a simple in-bounds check is enough (instead of materializing a
   /// loop). Distributes the following loop
   ///
   /// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
   ///
   /// to
   ///
   /// %iv = %lb + %procId * %step
   /// %cond = arith.cmpi "slt", %iv, %ub
   /// scf.if %cond {
   ///   ...
   /// }
   CyclicNumProcsGeNumIters = 1,

   /// Cyclic distribution where the number of processors can be assumed to be
   ///  equal to the number of iterations of the distributed loop. In such cases,
   ///  no bounds check is needed. Distributes the following loop
   ///
   /// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
   ///
   /// to
   ///
   /// %iv = %lb + %procId * %step
   CyclicNumProcsEqNumIters = 2
 };

 /// Callback function type used to get processor ID, and number of processors
 /// used for distribution for all parallel loops generated.
 struct ProcInfo {
   Value procId;
   Value nprocs;
 };
 using ProcInfoCallBackFn = std::function<SmallVector<ProcInfo, 2>(
     OpBuilder &b, Location loc, ArrayRef<Range> parallelLoopRanges)>;
 using OneDimProcInfoCallBackFn =
     std::function<ProcInfo(OpBuilder &b, Location loc)>;

 /// Options that allow distribution of loops generated in Linalg transforms to
 /// processors while generating the loops.
 struct LinalgLoopDistributionOptions {
   /// Callback function that returns the Values for processor ID (`procId`), and
   /// number of processors (`nprocs`) used to execute the parallel loops. The
   /// number of `{procId, nprocs}` pairs returned must be equal to the number of
   /// `parallelLoopRanges` passed into the callback, which in-turn is same as
   /// the number of parallel loops for which the `distributionMethod` is
   /// specified below.
   ProcInfoCallBackFn procInfo;
   /// Specification of how to distribute the `scf.parallel` loops that are
   /// generated. As the `scf.parallel` loop is generated, the elements of this
   /// vector is used (from left to right) and the specified distribution is
   /// applied. If the vector is less than the number of `scf.parallel` loops
   /// generated, then no distribution is applied.
   SmallVector<DistributionMethod, 0> distributionMethod = {};

   /// The map keyed by the distribution type that contains callback functions
   /// that return the Values for processor ID (`procId`), and number of
   /// processors (`nprocs`) used to execute the parallel loops.
   DenseMap<StringRef, OneDimProcInfoCallBackFn> procInfoMap;
 };

 /// Update the `lb`, `ub` and `step` to get per processor `lb`, `ub` and `step`.
 void updateBoundsForCyclicDistribution(OpBuilder &builder, Location loc,
                                        Value procId, Value nprocs, Value &lb,
                                        Value &ub, Value &step);

 //===----------------------------------------------------------------------===//
 // Generic op region utilities
 //===----------------------------------------------------------------------===//

 /// A struct containing common matchers over linalg op's region.
 struct RegionMatcher {
   enum class BinaryOpKind {
     IAdd,
   };

   /// Matches the given linalg op if its body is performing binary operation on
   /// int or float scalar values and returns the binary op kind.
   ///
   /// The linalg op's region is expected to be
   /// ```
   /// {
   ///   ^bb(%a: <scalar-type>, %b: <scalar-type>):
   ///     %0 = <binary-op> %a, %b: <scalar-type>
   ///     linalg.yield %0: <scalar-type>
   /// }
   /// ```
   static Optional<BinaryOpKind> matchAsScalarBinaryOp(GenericOp op);
 };

 //===----------------------------------------------------------------------===//
 // Loop nest utilities
 //===----------------------------------------------------------------------===//

 /// Utility class used to generate nested loops with ranges described by
 /// `loopRanges` and loop type described by the `iteratorTypes`. `bodyBuilderFn`
 /// is used to generate the body of the innermost loop. It is passed a range
 /// of loop induction variables and a range of operand values to use.
 template <typename LoopTy>
 struct GenerateLoopNest {
   static void doit(OpBuilder &b, Location loc, ArrayRef<Range> loopRanges,
                    LinalgOp linalgOp, ArrayRef<Attribute> iteratorTypes,
                    function_ref<scf::ValueVector(OpBuilder &, Location,
                                                  ValueRange, ValueRange)>
                        bodyBuilderFn,
                    Optional<LinalgLoopDistributionOptions> = None,
                    ArrayRef<StringRef> distributionTypes = {});
 };

 } // namespace linalg
 } // namespace mlir

 #endif // MLIR_DIALECT_LINALG_UTILS_H_
	//===- Utils.h - Utilities to support the Linalg dialect --------- C++ --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#ifndef MLIR_DIALECT_LINALG_UTILS_H_
	#define MLIR_DIALECT_LINALG_UTILS_H_

	#include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h"
	#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
	#include "mlir/Dialect/SCF/SCF.h"
	#include "mlir/Dialect/StandardOps/IR/Ops.h"
	#include "llvm/ADT/MapVector.h"
	#include "llvm/ADT/SetVector.h"

	namespace mlir {
	class AffineExpr;
	class AffineForOp;
	class AffineMap;
	class PatternRewriter;

	namespace linalg {
	class LinalgDependenceGraph;

	//===----------------------------------------------------------------------===//
	// General utilities
	//===----------------------------------------------------------------------===//

	/// Check if `permutation` is a permutation of the range
	/// `[0, permutation.size())`.
	bool isPermutation(ArrayRef<int64_t> permutation);

	/// Apply the permutation defined by `permutation` to `inVec`.
	/// Element `i` in `inVec` is mapped to location `j = permutation[i]`.
	/// E.g.: for an input vector `inVec = ['a', 'b', 'c']` and a permutation vector
	/// `permutation = [2, 0, 1]`, this function leaves `inVec = ['c', 'a', 'b']`.
	template <typename T, unsigned N>
	void applyPermutationToVector(SmallVector<T, N> &inVec,
	ArrayRef<int64_t> permutation) {
	SmallVector<T, N> auxVec(inVec.size());
	for (auto en : enumerate(permutation))
	auxVec[en.index()] = inVec[en.value()];
	inVec = auxVec;
	}

	/// Helper function that creates a memref::DimOp or tensor::DimOp depending on
	/// the type of `source`.
	Value createOrFoldDimOp(OpBuilder &b, Location loc, Value source, int64_t dim);

	/// Given an operation, retrieves the value of each dynamic dimension through
	/// constructing the necessary DimOp operators.
	SmallVector<Value, 4> getDynOperands(Location loc, Value val, OpBuilder &b);

	/// Computes an upper bound for the result `value` of an index computation.
	/// Translates AffineMinOps and AffineApplyOps along the use-def chains of the
	/// index computation to affine constraints and projects out intermediate
	/// values. The method sets `boundMap` to an affine map that given
	/// `boundOperands` evaluates to an upper bound for the index computation.
	///
	/// Example:
	/// ```
	/// %dim0 = dim %tensor, %c0
	/// %dim1 = dim %tensor, %c1
	/// %0 = affine.min affine.map<(d0) -> (40, d0)> (%dim0)
	/// %1 = affine.apply affine.map<(d0, d1) -> (d0 + d1)> (%0, %dim1)
	/// ```
	/// getUpperBoundForIndex(%1, boundMap, boundOperands)
	/// set the output parameters to:
	/// - boundMap = affine.map<(d0) -> (d0 + 40)>
	/// - boundOperands = [%dim1]
	void getUpperBoundForIndex(Value value, AffineMap &boundMap,
	SmallVectorImpl<Value> &boundOperands);

	/// Returns a constant upper bound for the result `value` of an index
	/// computation. Calls `getUpperBoundForIndex` and returns a constant upper
	/// bound if the result of `boundMap` is a constant expression and failure
	/// otherwise.
	///
	/// Example:
	/// ```
	/// %0 = affine.min affine.map<(d0) -> (40, d0)> (%d0)
	/// %1 = affine.apply affine.map<(d0) -> (d0 + 2)> (%0)
	/// ```
	/// getConstantUpperBoundForIndex(%1) returns 42
	/// (boundsMap = affine.map<() -> (42)>)
	FailureOr<int64_t> getConstantUpperBoundForIndex(Value value);

	/// Create an ExtractSliceOp and, if `source` is defined by an ExtractSliceOp,
	/// fold it by adding the offsets.
	///
	/// Example:
	/// ```
	/// %0 = tensor.extract_slice %arg0[3, 4][3, 32][1, 1] : tensor<64x64xf32> to
	/// tensor<3x32xf32>
	/// %1 = tensor.extract_slice %0[0, 5][3, 4][1, 1] : tensor<3x32xf32> to
	/// tensor<3x4xf32>
	/// ```
	/// folds into:
	/// ```
	/// %1 = tensor.extract_slice %arg0[3, 9][3, 4][1, 1] : tensor<64x64xf32> to
	/// tensor<3x4xf32>
	/// ```
	tensor::ExtractSliceOp makeComposedExtractSliceOp(
	OpBuilder &b, Location loc, Value source, ArrayRef<OpFoldResult> offsets,
	ArrayRef<OpFoldResult> sizes, ArrayRef<OpFoldResult> strides);

	/// Create a PadTensorOp that pads `source` to the size of the statically sized
	/// `type` whose static sizes are assumed to be greater than the dynamic
	/// `source` size. The padding introduces trailing `pad` values until the target
	/// size is met. If `source` is defined by one or more LinalgOps that have been
	/// padded with the same value and sizes, return their padded result instead of
	/// creating a PadTensorOp.
	///
	/// Example:
	/// ```
	/// %0 = tensor.extract_slice %arg0 [%iv0, %iv1] [%sz0, %sz1]
	/// %1 = linalg.pad_tensor %0 low[0, 0] high[...] { linalg.yield %cst }
	/// %2 = linalg.matmul ins(...) outs(%1)
	/// %3 = tensor.extract_slice %2 [0, 0] [%sz0, %sz1]
	/// ```
	/// makeComposedPadHighOp(source=%3, pad=%cst) returns %2
	/// makeComposedPadHighOp(source=%3, pad=%other_cst) returns %4
	/// ```
	/// %4 = linalg.pad_tensor %3 low[0, 0] high[...] { linalg.yield %other_cst }
	/// ```
	Value makeComposedPadHighOp(OpBuilder &b, Location loc, RankedTensorType type,
	Value source, Value pad, bool nofold);

	//===----------------------------------------------------------------------===//
	// Fusion / Tiling utilities
	//===----------------------------------------------------------------------===//

	/// The type of loops to be generated during tiling.
	enum class LinalgTilingLoopType {
	Loops = 0,
	AffineLoops = 1,
	ParallelLoops = 2,
	TiledLoops = 3,
	};

	/// Checks whether the specific `producer` is the last write to exactly the
	/// whole `consumedView`. This checks structural dominance, that the dependence
	/// is a RAW without any interleaved write to any piece of `consumedView`.
	bool isProducerLastWriteOfView(const LinalgDependenceGraph &graph,
	LinalgOp consumer, Value consumedView,
	LinalgOp producer);

	/// Checks whether fusing the specific `producer` of the `consumedView` is
	/// feasible. This checks `producer` is the last write of `consumedView` and
	/// that no interleaved dependence would be violated (RAW, WAR or WAW).
	bool isFusableInto(const LinalgDependenceGraph &graph, LinalgOp consumer,
	Value consumedView, LinalgOp producer);

	/// Compute tile offsets, given a list of loop `ivs` and `tileSizes`. In case a
	/// tile size is zero (i.e., no tiling), the corresponding offset is also zero.
	SmallVector<Value> computeTileOffsets(OpBuilder &b, Location loc,
	ValueRange ivs, ValueRange tileSizes);

	/// Compute tile sizes, given a list of loop `ivs`, `tileSizes` and dimension
	/// sizes (`sizeBounds`). In case a tile size is zero (i.e., no tiling), the
	/// corresponding result size is the corresponding value from `sizeBounds`.
	/// Note: The returned tile sizes are closed intervals.
	SmallVector<Value> computeTileSizes(OpBuilder &b, Location loc, ValueRange ivs,
	ValueRange tileSizes,
	ArrayRef<Value> sizeBounds);

	/// Creates an extract_slice/subview op for a single `valueToTile` with
	/// `builder`. This new operation extracts a tile of `valueToTile`, starting
	/// at offsets `lbs` and with sizes `subShapeSizes`.
	Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
	ValueRange tileSizes, AffineMap map, ValueRange lbs,
	ValueRange ubs, ValueRange subShapeSizes);

	/// Creates extract_slice/subview ops for all `valuesToTile` of the given
	/// `linalgOp` with `builder`, assuming `linalgOp` is being fused into a loop
	/// nest for tiling with the given induction variables `ivs` and tile sizes
	/// `tileSizes`. `sizeBounds` are the iteration space bounds for all the
	/// implicit loops in `linalgOp`.
	///
	/// Note that a constant zero in `tileSizes` means no tiling at that implicit
	/// loop. The number of non-zero values in `tileSizes` should be equal to the
	/// number of values in `ivs`.
	SmallVector<Value, 4> makeTiledShapes(OpBuilder &builder, Location loc,
	LinalgOp linalgOp,
	ArrayRef<Value> valuesToTile,
	ValueRange ivs, ValueRange tileSizes,
	ArrayRef<Value> sizeBounds);

	/// Add the tile loop induction variables `ivs` to the IndexOp results found in
	/// the body of the `tiledOp` to account for the tile offset.
	void addTileLoopIvsToIndexOpResults(OpBuilder &b, LinalgOp tiledOp,
	ArrayRef<Value> ivs);

	using FusableOpDependencesTy = llvm::MapVector<
	Operation *,
	SmallVector<LinalgDependenceGraph::LinalgDependenceGraphElem, 1>>;
	FusableOpDependencesTy
	findAllFusableDependences(ArrayRef<LinalgOp> ops,
	const LinalgDependenceGraph &dependenceGraph);

	/// A struct containing the Linalg producer before and after fusion.
	/// When operating on tensors, `fusedProducer` may feed into a `tensor.cast` op
	/// before the consumer Linalg op, until enough canonicalizations have applied.
	struct FusionInfo {
	LinalgOp originalProducer;
	LinalgOp fusedProducer;
	};

	/// Fuses producer into consumer if the producer is structurally feasible and
	/// the fusion would not violate dependencies.
	/// Implements the fusion part of the "tileAndFuse on buffers" transformation
	/// and thus requires the `consumerOpOperand` to be a `subview` op (generally
	/// obtained by applying the tiling transformation).
	FailureOr<FusionInfo> fuseProducerOfBuffer(OpBuilder &b,
	OpOperand &consumerOpOperand,
	const LinalgDependenceGraph &graph);
	/// Tensor counterpart of `fuseProducerOfBuffer`.
	/// This implements the fusion part of the "tileAndFuse on tensors"
	/// transformation and thus requires the `consumerOpOperand` to be a
	/// `extract_slice` op (generally obtained by applying the tiling
	/// transformation).
	FailureOr<FusionInfo> fuseProducerOfTensor(OpBuilder &b,
	OpOperand &consumerOpOperand);
	/// Tensor counterpart of `fuseProducerOfBuffer`.
	/// This implements the fusion part of the "tileAndFuse on tensors"
	/// transformation and thus requires the `consumerOpOperand` to be a
	/// `extract_slice` op (generally obtained by applying the tiling
	/// transformation). Assumes `producerOfTensor` is a Linalg op that produces
	/// `consumerOpOperand`.
	FailureOr<FusionInfo> fuseProducerOfTensor(OpBuilder &b,
	OpResult producerOpResult,
	OpOperand &consumerOpOperand);

	//===----------------------------------------------------------------------===//
	// Fusion on tensor utilities
	//===----------------------------------------------------------------------===//

	/// A struct to manage the tile loop nest specific information.
	class TileLoopNest {
	public:
	TileLoopNest(LinalgOp rootOp) : rootOp(rootOp) {}

	/// Tile the root operation using the given `tileSizes` and `tileInterchange`.
	LogicalResult tileRootOp(OpBuilder &b, ArrayRef<int64_t> tileSizes,
	ArrayRef<int64_t> tileInterchange);

	/// Fuse the producer of `consumerOpOperand` into the tile loop nest. Returns
	/// the fused producer or fails if fusion is not possible.
	FailureOr<LinalgOp> fuseProducer(OpBuilder &b, OpOperand *consumerOpOperand);

	/// Returns the replacement results for the original untiled root operation.
	ValueRange getRootOpReplacementResults();

	/// Returns the tiled root operation.
	LinalgOp getRootOp() { return rootOp; }

	/// Returns the tiled root operation and the fused producers.
	SmallVector<LinalgOp> getAllTiledAndFusedOps();

	/// Returns the loop ops generated from tiling.
	ArrayRef<scf::ForOp> getLoopOps() { return tileLoopOps; }

	private:
	/// Returns true if the tile loop nest has no tile loops.
	bool isEmpty();

	/// Returns true if the tile loop nest invariants are satisfied:
	/// - The `rootOp` has been tiled at least once.
	/// - The number of tile loop operations and dimensions match.
	/// - The innermost tile loop is the parent of `tiledOp`.
	/// - The tile loops are directly nested.
	// TODO: relax to support additional control flow, e.g., IfOp.
	bool isValid();

	/// Searches the block arguments tied to a block argument `bbArg` of the
	/// innermost tile loop. Returns the block argument from outermost to
	/// innermost or an empty vector if none are found.
	SmallVector<BlockArgument> getTiedBBArgs(BlockArgument bbArg);

	/// Returns the iteration argument of the outermost tile loop mapped to a
	/// block argument `bbArg` of the innermost tile loop.
	OpOperand *getTiedIterArg(BlockArgument bbArg);

	/// Returns true if `bbArg` has other used than `sliceOp` and its
	/// dependencies. Only if there are no other uses, the producer output
	/// iteration argument may reused to pass the producer result after fusion.
	bool hasOtherUses(BlockArgument bbArg, tensor::ExtractSliceOp sliceOp);

	LinalgOp rootOp;
	SmallVector<scf::ForOp> tileLoopOps;
	DenseMap<Operation *, SmallVector<int64_t>> tiledRootAndFusedOpsLoops;
	};

	/// Tiles `consumerOp` and fuses its dependencies if possible. Uses the
	/// `tileSizes` and `tileInterchange` parameters to control the tiling.
	FailureOr<TileLoopNest>
	tileConsumerAndFuseProducers(OpBuilder &b, LinalgOp consumerOp,
	ArrayRef<int64_t> tileSizes,
	ArrayRef<int64_t> tileInterchange);

	//===----------------------------------------------------------------------===//
	// Distribution utilities
	//===----------------------------------------------------------------------===//

	/// Scheme used to distribute loops to processors.
	enum class DistributionMethod {
	/// Cyclic distribution where no assumption is made about the dynamic
	/// relationship between number of processors and number of iterations of the
	/// distributed loop. Distributes the following loop
	///
	/// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
	///
	/// to
	///
	/// scf.parallel(%iv)= (%lb + %procId * %step) to (%ub) step (%step * %nprocs)
	Cyclic = 0,

	/// Cyclic distribution where the number of processors can be assumed to be
	/// more than or equal to the number of iterations of the distributed loop. In
	/// such cases, a simple in-bounds check is enough (instead of materializing a
	/// loop). Distributes the following loop
	///
	/// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
	///
	/// to
	///
	/// %iv = %lb + %procId * %step
	/// %cond = arith.cmpi "slt", %iv, %ub
	/// scf.if %cond {
	/// ...
	/// }
	CyclicNumProcsGeNumIters = 1,

	/// Cyclic distribution where the number of processors can be assumed to be
	/// equal to the number of iterations of the distributed loop. In such cases,
	/// no bounds check is needed. Distributes the following loop
	///
	/// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
	///
	/// to
	///
	/// %iv = %lb + %procId * %step
	CyclicNumProcsEqNumIters = 2
	};

	/// Callback function type used to get processor ID, and number of processors
	/// used for distribution for all parallel loops generated.
	struct ProcInfo {
	Value procId;
	Value nprocs;
	};
	using ProcInfoCallBackFn = std::function<SmallVector<ProcInfo, 2>(
	OpBuilder &b, Location loc, ArrayRef<Range> parallelLoopRanges)>;
	using OneDimProcInfoCallBackFn =
	std::function<ProcInfo(OpBuilder &b, Location loc)>;

	/// Options that allow distribution of loops generated in Linalg transforms to
	/// processors while generating the loops.
	struct LinalgLoopDistributionOptions {
	/// Callback function that returns the Values for processor ID (`procId`), and
	/// number of processors (`nprocs`) used to execute the parallel loops. The
	/// number of `{procId, nprocs}` pairs returned must be equal to the number of
	/// `parallelLoopRanges` passed into the callback, which in-turn is same as
	/// the number of parallel loops for which the `distributionMethod` is
	/// specified below.
	ProcInfoCallBackFn procInfo;
	/// Specification of how to distribute the `scf.parallel` loops that are
	/// generated. As the `scf.parallel` loop is generated, the elements of this
	/// vector is used (from left to right) and the specified distribution is
	/// applied. If the vector is less than the number of `scf.parallel` loops
	/// generated, then no distribution is applied.
	SmallVector<DistributionMethod, 0> distributionMethod = {};

	/// The map keyed by the distribution type that contains callback functions
	/// that return the Values for processor ID (`procId`), and number of
	/// processors (`nprocs`) used to execute the parallel loops.
	DenseMap<StringRef, OneDimProcInfoCallBackFn> procInfoMap;
	};

	/// Update the `lb`, `ub` and `step` to get per processor `lb`, `ub` and `step`.
	void updateBoundsForCyclicDistribution(OpBuilder &builder, Location loc,
	Value procId, Value nprocs, Value &lb,
	Value &ub, Value &step);

	//===----------------------------------------------------------------------===//
	// Generic op region utilities
	//===----------------------------------------------------------------------===//

	/// A struct containing common matchers over linalg op's region.
	struct RegionMatcher {
	enum class BinaryOpKind {
	IAdd,
	};

	/// Matches the given linalg op if its body is performing binary operation on
	/// int or float scalar values and returns the binary op kind.
	///
	/// The linalg op's region is expected to be
	/// ```
	/// {
	/// ^bb(%a: <scalar-type>, %b: <scalar-type>):
	/// %0 = <binary-op> %a, %b: <scalar-type>
	/// linalg.yield %0: <scalar-type>
	/// }
	/// ```
	static Optional<BinaryOpKind> matchAsScalarBinaryOp(GenericOp op);
	};

	//===----------------------------------------------------------------------===//
	// Loop nest utilities
	//===----------------------------------------------------------------------===//

	/// Utility class used to generate nested loops with ranges described by
	/// `loopRanges` and loop type described by the `iteratorTypes`. `bodyBuilderFn`
	/// is used to generate the body of the innermost loop. It is passed a range
	/// of loop induction variables and a range of operand values to use.
	template <typename LoopTy>
	struct GenerateLoopNest {
	static void doit(OpBuilder &b, Location loc, ArrayRef<Range> loopRanges,
	LinalgOp linalgOp, ArrayRef<Attribute> iteratorTypes,
	function_ref<scf::ValueVector(OpBuilder &, Location,
	ValueRange, ValueRange)>
	bodyBuilderFn,
	Optional<LinalgLoopDistributionOptions> = None,
	ArrayRef<StringRef> distributionTypes = {});
	};

	} // namespace linalg
	} // namespace mlir

	#endif // MLIR_DIALECT_LINALG_UTILS_H_