| //===- Transforms.h - Linalg transformations as patterns --------*- C++ -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H |
| #define MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H |
| |
| #include <utility> |
| |
| #include "mlir/Conversion/VectorToSCF/VectorToSCF.h" |
| #include "mlir/Dialect/Bufferization/IR/Bufferization.h" |
| #include "mlir/Dialect/Linalg/Utils/Utils.h" |
| #include "mlir/Dialect/MemRef/IR/MemRef.h" |
| #include "mlir/Dialect/SCF/Utils/Utils.h" |
| #include "mlir/Dialect/Tensor/IR/Tensor.h" |
| #include "mlir/Dialect/Utils/StaticValueUtils.h" |
| #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" |
| #include "mlir/Dialect/X86Vector/Transforms.h" |
| #include "mlir/IR/PatternMatch.h" |
| #include "mlir/Interfaces/TilingInterface.h" |
| #include "mlir/Support/LogicalResult.h" |
| #include "mlir/Transforms/DialectConversion.h" |
| #include "llvm/ADT/SmallBitVector.h" |
| #include "llvm/ADT/SmallSet.h" |
| |
| namespace mlir { |
| namespace bufferization { |
| class AllocTensorOp; |
| class OneShotAnalysisState; |
| } // namespace bufferization |
| |
| namespace linalg { |
| |
| class LinalgOp; |
| |
| //===----------------------------------------------------------------------===// |
| // Utils. |
| //===----------------------------------------------------------------------===// |
| |
| /// Return vector::CombiningKind for the given op. |
| std::optional<vector::CombiningKind> getCombinerOpKind(Operation *combinerOp); |
| |
| //===----------------------------------------------------------------------===// |
| // Bufferization-related transforms. |
| //===----------------------------------------------------------------------===// |
| |
| struct BufferizeToAllocationOptions { |
| enum class AllocOp { MemrefAlloc = 0, MemrefAlloca = 1 }; |
| AllocOp allocOp = AllocOp::MemrefAlloc; |
| |
| enum class MemcpyOp { |
| MaterializeInDestination = 0, |
| MemrefCopy = 1, |
| LinalgCopy = 2 |
| }; |
| MemcpyOp memcpyOp = MemcpyOp::MaterializeInDestination; |
| |
| /// If set to "true", only the destination tensor operands are bufferized to |
| /// a new allocation (and wrapped in "bufferization.to_tensor"), but not the |
| /// targeted op itself. |
| bool bufferizeDestinationOnly = false; |
| |
| /// If set to "true", a `memref.dealloc` operation will be emitted for each |
| /// allocated buffer. Otherwise, the memory is leaked, which is useful if |
| /// the buffer deallocation pipeline should be run after bufferization is |
| /// done. |
| bool emitDealloc = false; |
| }; |
| |
| /// Materialize a buffer allocation for the given tensor.pad op and lower the |
| /// op to linalg.fill/linalg.generic + bufferization.materialize_in_destination. |
| /// E.g.: |
| /// |
| /// %0 = tensor.pad low[%l] high[%h] %t ... |
| /// |
| /// is lowered to: |
| /// |
| /// %alloc = memref.alloc |
| /// linalg.fill ... outs(%alloc) |
| /// %subview = memref.subview %alloc [%l] [...] [1] |
| /// bufferization.materialize_in_destination %t in %subview |
| /// %0 = bufferization.to_tensor %alloc restrict writable |
| /// |
| /// In addition to rewriting the IR as shown above, this function returns the |
| /// newly allocated buffer. The `insertionPoint` parameter can be used to |
| /// specify a custom insertion point for the buffer allocation. |
| Value bufferizeToAllocation(RewriterBase &rewriter, |
| const BufferizeToAllocationOptions &options, |
| tensor::PadOp padOp, Attribute memorySpace = {}, |
| Operation *insertionPoint = nullptr); |
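| |
| // A minimal usage sketch for the tensor.pad overload (illustrative only; |
| // assumes `rewriter` and `padOp` are available in the surrounding pass or |
| // transform): |
| // |
| //   BufferizeToAllocationOptions opts; |
| //   opts.memcpyOp = BufferizeToAllocationOptions::MemcpyOp::LinalgCopy; |
| //   Value buffer = bufferizeToAllocation(rewriter, opts, padOp); |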
| |
| /// Materialize a buffer allocation for the given vector.mask op and bufferize |
| /// the op, including its region. E.g.: |
| /// |
| /// %0 = vector.mask { |
| /// vector.transfer_write %v, %t : vector<16xf32>, tensor<?xf32> |
| /// } : vector<16xi1> -> tensor<?xf32> |
| /// |
| /// is lowered to: |
| /// |
| /// %alloc = memref.alloc |
| /// bufferization.materialize_in_destination %t in %alloc |
| /// vector.mask { |
| /// vector.transfer_write %v, %alloc : vector<16xf32>, memref<?xf32> |
| /// } : vector<16xi1> |
| /// %0 = bufferization.to_tensor %alloc restrict writable |
| /// |
| /// In addition to rewriting the IR as shown above, this function returns the |
| /// newly allocated buffer. The `insertionPoint` parameter can be used to |
| /// specify a custom insertion point for the buffer allocation. |
| Value bufferizeToAllocation(RewriterBase &rewriter, |
| const BufferizeToAllocationOptions &options, |
| vector::MaskOp maskOp, Attribute memorySpace = {}, |
| Operation *insertionPoint = nullptr); |
| |
| /// Materialize a buffer allocation for the given bufferization.alloc_tensor op |
| /// and lower the op to memref.alloc + memref.tensor_store. |
| /// |
| /// In addition to rewriting the IR, this function returns the newly allocated |
| /// buffer. The `insertionPoint` parameter can be used to specify a custom |
| /// insertion point for the buffer allocation. |
| Value bufferizeToAllocation(RewriterBase &rewriter, |
| const BufferizeToAllocationOptions &options, |
| bufferization::AllocTensorOp allocTensorOp, |
| Attribute memorySpace = {}, |
| Operation *insertionPoint = nullptr); |
| |
| /// Bufferize the given op with tensor semantics and materialize the result in |
| /// a newly allocated buffer. |
| /// |
| /// Only bufferizable ops that bufferize to a memory write or have an |
| /// aliasing OpOperand (and do not themselves bufferize to an allocation) are |
| /// supported. They are bufferized using their BufferizableOpInterface |
| /// implementation. |
| /// |
| /// Selected ops that bufferize to an allocation (or need special handling) are |
| /// also supported: |
| /// - tensor.pad |
| /// - vector.mask |
| /// |
| /// This function returns the newly allocated buffer. The `insertionPoint` |
| /// parameter can be used to specify a custom insertion point for the buffer |
| /// allocation. |
| Value bufferizeToAllocation(RewriterBase &rewriter, |
| const BufferizeToAllocationOptions &options, |
| Operation *op, Attribute memorySpace = {}, |
| Operation *insertionPoint = nullptr); |
| |
| /// Try to eliminate tensor::EmptyOps inside `op` that are anchored on a |
| /// LinalgOp. This transform looks for LinalgOps that have an unused output |
| /// operand and an input operand that is rooted in a tensor::EmptyOp. The |
| /// tensor::EmptyOp uses are replaced with the output operand and the two |
| /// operands of the LinalgOp are swapped. |
| /// |
| /// Example: |
| /// %0 = tensor.empty() |
| /// %1 = linalg.matmul ins(...) outs(%0) |
| /// %2 = linalg.generic ins(%1) outs(%dest) { |
| /// ^bb0(%in: f32, %out: f32): |
| /// // out not used |
| /// } |
| /// |
| /// The IR is transformed as follows: |
| /// %0 = tensor.empty() |
| /// %1 = linalg.matmul ins(...) outs(%dest) |
| /// %2 = linalg.generic ins(%0) outs(%1) { |
| /// ^bb0(%in: f32, %out: f32): |
| /// // Use %out instead of %in |
| /// } |
| /// |
| /// The "ins" operand has no uses inside the body of the LinalgOp and can be |
| /// folded away with existing cleanup patterns. Afterwards, the tensor::EmptyOp |
| /// can also fold away. |
| LogicalResult linalgOpAnchoredEmptyTensorEliminationStep( |
| RewriterBase &rewriter, Operation *op, |
| bufferization::OneShotAnalysisState &state); |
| |
| //===----------------------------------------------------------------------===// |
| // Structs that configure the behavior of various transformations. |
| //===----------------------------------------------------------------------===// |
| |
| using TileSizeComputationFunction = |
| std::function<SmallVector<Value, 4>(OpBuilder &, Operation *)>; |
| |
| struct LinalgTilingOptions { |
| /// Computation function that returns the tile sizes for each operation. |
| /// Constant tile sizes should be constructed lazily (at the time they are |
| /// needed) so that they interoperate with folding. |
| TileSizeComputationFunction tileSizeComputationFunction = nullptr; |
| |
| LinalgTilingOptions & |
| setTileSizeComputationFunction(TileSizeComputationFunction fun) { |
| tileSizeComputationFunction = std::move(fun); |
| return *this; |
| } |
| /// Set the `tileSizeComputationFunction` to return the values `ts`. The |
| /// values must not fold away when tiling. Otherwise, use a more robust |
| /// `tileSizeComputationFunction`. |
| LinalgTilingOptions &setTileSizes(const SmallVector<Value, 4> &ts) { |
| tileSizeComputationFunction = [=](OpBuilder &, Operation *) { return ts; }; |
| return *this; |
| } |
| /// Convenience function to set the `tileSizeComputationFunction` to a |
| /// function that computes tile sizes at the point they are needed. Allows |
| /// proper interaction with folding. |
| LinalgTilingOptions &setTileSizes(ArrayRef<int64_t> ts); |
| |
| /// Tile all dynamic dimensions by 1. I.e., scalarize those dimensions. |
| /// Note: `scalarizeDynamicDims` and `setTileSizes` cannot be used together. |
| LinalgTilingOptions &scalarizeDynamicDims(); |
| |
| /// The interchange vector to reorder the tiled loops. |
| SmallVector<unsigned, 4> interchangeVector = {}; |
| |
| LinalgTilingOptions &setInterchange(ArrayRef<unsigned> interchange) { |
| interchangeVector.assign(interchange.begin(), interchange.end()); |
| return *this; |
| } |
| |
| /// The type of tile loops to generate. |
| LinalgTilingLoopType loopType = LinalgTilingLoopType::Loops; |
| |
| LinalgTilingOptions &setLoopType(LinalgTilingLoopType lt) { |
| loopType = lt; |
| return *this; |
| } |
| |
| /// When set, specifies how the generated tile loops are distributed to |
| /// processors. |
| std::optional<LinalgLoopDistributionOptions> distribution; |
| |
| LinalgTilingOptions & |
| setDistributionOptions(LinalgLoopDistributionOptions distributionOptions) { |
| distribution = std::move(distributionOptions); |
| return *this; |
| } |
| |
| /// Specification markers of how to distribute the `linalg.tiled_loop`. |
| SmallVector<StringRef, 2> distributionTypes = {}; |
| |
| LinalgTilingOptions &setDistributionTypes(ArrayRef<StringRef> types) { |
| distributionTypes.assign(types.begin(), types.end()); |
| return *this; |
| } |
| |
| /// Peel the specified loops. |
| SmallVector<int64_t> peeledLoops; |
| |
| LinalgTilingOptions &setPeeledLoops(ArrayRef<int64_t> loops) { |
| peeledLoops.clear(); |
| peeledLoops.append(loops.begin(), loops.end()); |
| return *this; |
| } |
| }; |
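| |
| // Example of configuring the options via the fluent setters (a sketch; the |
| // tile sizes and interchange are illustrative): |
| // |
| //   LinalgTilingOptions tilingOptions; |
| //   tilingOptions.setTileSizes({8, 16, 0}) // innermost loop is left untiled |
| //       .setInterchange({1, 0, 2}) |
| //       .setLoopType(LinalgTilingLoopType::Loops); |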
| |
| struct LinalgTilingAndFusionOptions { |
| /// Tile sizes used to tile the root operation. |
| SmallVector<int64_t> tileSizes; |
| LinalgTilingAndFusionOptions &setTileSizes(ArrayRef<int64_t> ts) { |
| tileSizes.assign(ts.begin(), ts.end()); |
| return *this; |
| } |
| /// Tile interchange used to permute the tile loops. |
| SmallVector<int64_t> tileInterchange; |
| /// When set, specifies how the generated tile loops are distributed to |
| /// processors. |
| std::optional<LinalgLoopDistributionOptions> tileDistribution; |
| LinalgTilingAndFusionOptions & |
| setDistributionOptions(LinalgLoopDistributionOptions distributionOptions) { |
| tileDistribution = std::move(distributionOptions); |
| return *this; |
| } |
| }; |
| |
| struct LinalgPaddingOptions { |
| /// A padding value for every operand. |
| SmallVector<Attribute> paddingValues; |
| LinalgPaddingOptions &setPaddingValues(ArrayRef<Attribute> pv) { |
| paddingValues.assign(pv.begin(), pv.end()); |
| return *this; |
| } |
| /// A list of iterator dimensions to pad. |
| SmallVector<int64_t> paddingDimensions; |
| LinalgPaddingOptions &setPaddingDimensions(ArrayRef<int64_t> pd) { |
| paddingDimensions.assign(pd.begin(), pd.end()); |
| return *this; |
| } |
| /// A list of multiples to which each padding dimension should be padded. |
| std::optional<SmallVector<int64_t>> padToMultipleOf; |
| LinalgPaddingOptions &setPadToMultipleOf(ArrayRef<int64_t> m) { |
| padToMultipleOf.emplace(m.begin(), m.end()); |
| return *this; |
| } |
| /// A flag for every operand to mark the PadOp as nofold, which enables |
| /// packing for statically shaped operands. |
| SmallVector<bool> packPaddings; |
| LinalgPaddingOptions &setPackPaddings(ArrayRef<bool> pp) { |
| packPaddings.assign(pp.begin(), pp.end()); |
| return *this; |
| } |
| /// The number of loops out of which to hoist the PadOp, for every operand. |
| SmallVector<int64_t> hoistPaddings; |
| LinalgPaddingOptions &setHoistPaddings(ArrayRef<int64_t> hp) { |
| hoistPaddings.assign(hp.begin(), hp.end()); |
| return *this; |
| } |
| /// A permutation vector for every operand used to transpose the packed |
| /// PadOp results. |
| SmallVector<SmallVector<int64_t>> transposePaddings; |
| LinalgPaddingOptions & |
| setTransposePaddings(ArrayRef<SmallVector<int64_t>> tp) { |
| transposePaddings.assign(tp.begin(), tp.end()); |
| return *this; |
| } |
| enum class CopyBackOp : int8_t { |
| None = 0, |
| BufferizationMaterializeInDestination = 1, |
| LinalgCopy = 2 |
| }; |
| /// The op to be used for copying the padded result to the original |
| /// destination tensor. |
| CopyBackOp copyBackOp = CopyBackOp::BufferizationMaterializeInDestination; |
| LinalgPaddingOptions &setCopyBackOp(CopyBackOp op) { |
| copyBackOp = op; |
| return *this; |
| } |
| }; |
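| |
| // Example of configuring padding options (a sketch; assumes a builder `b` is |
| // available to create the zero padding values, and that the op has three |
| // operands): |
| // |
| //   LinalgPaddingOptions paddingOptions; |
| //   SmallVector<Attribute> zeroes(3, b.getZeroAttr(b.getF32Type())); |
| //   paddingOptions.setPaddingValues(zeroes) |
| //       .setPaddingDimensions({0, 1, 2}) |
| //       .setPackPaddings({true, true, true}) |
| //       .setCopyBackOp(LinalgPaddingOptions::CopyBackOp::LinalgCopy); |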
| |
| /// Callback function type used to perform the allocation for the promoted |
| /// `subView`. For each dimension, `boundingSubViewSize` contains a best-effort |
| /// smallest constant value for the size of the buffer needed; if no constant |
| /// bound can be computed, it contains the dynamic size of the subview. The |
| /// callback should return the buffer to use. |
| using AllocBufferCallbackFn = std::function<std::optional<Value>( |
| OpBuilder &b, memref::SubViewOp subView, |
| ArrayRef<Value> boundingSubViewSize, DataLayout &layout)>; |
| |
| /// Callback function type used to deallocate the buffers used to hold the |
| /// promoted subview. |
| using DeallocBufferCallbackFn = |
| std::function<LogicalResult(OpBuilder &b, Value buffer)>; |
| |
| /// Callback function type used to insert a copy from the original subview to |
| /// the subview of the promoted region (for read operands), or from the |
| /// subview of the promoted region back to the original subview (for results). |
| /// The copy has to happen from `src` to `dst`. |
| using CopyCallbackFn = |
| std::function<LogicalResult(OpBuilder &b, Value src, Value dst)>; |
| |
| struct LinalgPromotionOptions { |
| /// Indices of subViews to promote. If `std::nullopt`, try to promote all |
| /// operands. |
| std::optional<DenseSet<unsigned>> operandsToPromote; |
| LinalgPromotionOptions &setOperandsToPromote(ArrayRef<int64_t> operands) { |
| operandsToPromote = DenseSet<unsigned>(); |
| operandsToPromote->insert(operands.begin(), operands.end()); |
| return *this; |
| } |
| /// If the ith element of `useFullTiles` is true, the full view should be |
| /// used for the promoted buffer of the ith operand in `operandsToPromote`. |
| /// Otherwise the partial view will be used. The decision defaults to |
| /// `useFullTileBuffersDefault` when `useFullTileBuffers` is std::nullopt and |
| /// for operands missing from `useFullTileBuffers`. |
| std::optional<llvm::SmallBitVector> useFullTileBuffers; |
| LinalgPromotionOptions &setUseFullTileBuffers(ArrayRef<bool> useFullTiles) { |
| unsigned size = useFullTiles.size(); |
| llvm::SmallBitVector tmp(size, false); |
| for (unsigned i = 0; i < size; ++i) |
| tmp[i] = useFullTiles[i]; |
| useFullTileBuffers = tmp; |
| return *this; |
| } |
| /// If true all operands unspecified by `useFullTileBuffers` will use the |
| /// full view, otherwise the partial view. |
| bool useFullTileBuffersDefault = false; |
| LinalgPromotionOptions &setUseFullTileBuffersByDefault(bool use) { |
| useFullTileBuffersDefault = use; |
| return *this; |
| } |
| /// Alignment of promoted buffer. If `std::nullopt` do not specify alignment. |
| std::optional<unsigned> alignment; |
| LinalgPromotionOptions &setAlignment(unsigned align) { |
| alignment = align; |
| return *this; |
| } |
| /// Memory space of promoted buffer. If `std::nullopt` do not specify memory |
| /// space. |
| std::optional<Attribute> memorySpace; |
| LinalgPromotionOptions &setMemorySpace(Attribute memorySpc) { |
| memorySpace = memorySpc; |
| return *this; |
| } |
| /// Use alloca with the default allocation scheme. |
| bool useAlloca = false; |
| LinalgPromotionOptions &setUseAlloca(bool use) { |
| useAlloca = use; |
| return *this; |
| } |
| /// Callback function to do the allocation of the promoted buffer. If |
| /// std::nullopt, then the default allocation scheme of allocating a |
| /// memref<?xi8> buffer followed by a view operation is used. |
| std::optional<AllocBufferCallbackFn> allocationFn; |
| std::optional<DeallocBufferCallbackFn> deallocationFn; |
| LinalgPromotionOptions & |
| setAllocationDeallocationFns(AllocBufferCallbackFn const &allocFn, |
| DeallocBufferCallbackFn const &deallocFn) { |
| allocationFn = allocFn; |
| deallocationFn = deallocFn; |
| return *this; |
| } |
| /// Callback function to do the copy of data to and from the promoted |
| /// subview. If std::nullopt then a memref.copy is used. |
| std::optional<CopyCallbackFn> copyInFn; |
| std::optional<CopyCallbackFn> copyOutFn; |
| LinalgPromotionOptions &setCopyInOutFns(CopyCallbackFn const ©In, |
| CopyCallbackFn const ©Out) { |
| copyInFn = copyIn; |
| copyOutFn = copyOut; |
| return *this; |
| } |
| }; |
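| |
| // Example: promote the first two operands into GPU workgroup memory using the |
| // helper callbacks declared further below in this header (a sketch only): |
| // |
| //   LinalgPromotionOptions promotionOptions; |
| //   promotionOptions.setOperandsToPromote({0, 1}) |
| //       .setUseFullTileBuffersByDefault(true) |
| //       .setAllocationDeallocationFns(allocateWorkgroupMemory, |
| //                                     deallocateWorkgroupMemory) |
| //       .setCopyInOutFns(copyToWorkgroupMemory, copyToWorkgroupMemory); |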
| |
| /// Split Reduction options. |
| struct SplitReductionOptions { |
| // Ratio used to split the reduction dimension. If the ratio is <= 1, |
| // nothing will be done. |
| int64_t ratio = 0; |
| // Index where the extra dimension is added to the intermediate tensor |
| // shape. |
| unsigned index = 0; |
| // Whether the inner dimension after splitting is parallel or reduction. |
| bool innerParallel = false; |
| }; |
| |
| /// Function signature to control reduction splitting. This returns |
| /// `SplitReductionOptions`. |
| // TODO: don't use unsigned unless doing bit manipulation. |
| using ControlSplitReductionFn = |
| std::function<SplitReductionOptions(LinalgOp op)>; |
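| |
| // Example of a control function that splits every eligible reduction by an |
| // (illustrative) factor of 4, adding the extra dimension at index 0: |
| // |
| //   ControlSplitReductionFn control = [](LinalgOp op) { |
| //     return SplitReductionOptions{/*ratio=*/4, /*index=*/0, |
| //                                  /*innerParallel=*/false}; |
| //   }; |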
| |
| //===----------------------------------------------------------------------===// |
| // Preconditions that ensure the corresponding transformation succeeds and can |
| // be applied as a rewrite pattern. |
| //===----------------------------------------------------------------------===// |
| |
| /// Return true if two `linalg.generic` operations with producer/consumer |
| /// relationship through `fusedOperand` can be fused using elementwise op |
| /// fusion. |
| bool areElementwiseOpsFusable(OpOperand *fusedOperand); |
| |
| /// Return success if the `memref.subview`s feeding the linalg-on-buffers |
| /// operation `op` can be promoted according to `options`. |
| LogicalResult promoteSubviewsPrecondition(Operation *op, |
| LinalgPromotionOptions options); |
| |
| /// Return success if the operation can be vectorized. |
| LogicalResult vectorizeOpPrecondition(Operation *op, |
| ArrayRef<int64_t> inputVectorSizes = {}, |
| ArrayRef<bool> inputScalableVecDims = {}, |
| bool vectorizeNDExtract = false, |
| bool flatten1DDepthwiseConv = false); |
| |
| //===----------------------------------------------------------------------===// |
| // Transformations exposed as functional-style API calls. |
| //===----------------------------------------------------------------------===// |
| |
| using LinalgLoops = SmallVector<Operation *, 4>; |
| |
| /// Transformation to drop unit-extent dimensions from `linalg.generic` |
| /// operations. |
| struct ControlDropUnitDims { |
| enum class RankReductionStrategy { ReassociativeReshape, ExtractInsertSlice }; |
| |
| RankReductionStrategy rankReductionStrategy = |
| RankReductionStrategy::ReassociativeReshape; |
| |
| using ControlFnTy = std::function<SmallVector<unsigned>(Operation *)>; |
| ControlFnTy controlFn = [](Operation *op) { |
| if (auto genericOp = dyn_cast_or_null<GenericOp>(op)) { |
| return llvm::to_vector(llvm::seq<unsigned>(0, genericOp.getNumLoops())); |
| } |
| if (auto padOp = dyn_cast_or_null<tensor::PadOp>(op)) { |
| return llvm::to_vector( |
| llvm::seq<unsigned>(0, padOp.getSourceType().getRank())); |
| } |
| return SmallVector<unsigned>{}; |
| }; |
| }; |
| LogicalResult dropUnitDims(RewriterBase &rewriter, GenericOp genericOp, |
| const ControlDropUnitDims &options); |
| |
| /// Fuse two `linalg.generic` operations that have a producer-consumer |
| /// relationship captured through `fusedOperand`. The method expects |
| /// that `areElementwiseOpsFusable` returns true for the given `fusedOperand`. |
| struct ElementwiseOpFusionResult { |
| Operation *fusedOp; |
| llvm::DenseMap<Value, Value> replacements; |
| static llvm::SmallDenseSet<int> |
| getPreservedProducerResults(GenericOp producer, GenericOp consumer); |
| }; |
| FailureOr<ElementwiseOpFusionResult> |
| fuseElementwiseOps(RewriterBase &rewriter, OpOperand *fusedOperand); |
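| |
| // Typical driver logic (a sketch; assumes `rewriter` and `fusedOperand` exist |
| // in the caller's context): |
| // |
| //   if (!areElementwiseOpsFusable(fusedOperand)) |
| //     return failure(); |
| //   FailureOr<ElementwiseOpFusionResult> fused = |
| //       fuseElementwiseOps(rewriter, fusedOperand); |
| //   if (failed(fused)) |
| //     return failure(); |
| //   for (auto [origVal, replacement] : fused->replacements) |
| //     rewriter.replaceAllUsesWith(origVal, replacement); |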
| |
| /// Try to peel and canonicalize loop `op` and return the new result. |
| /// Also applies affine_min/max bounds simplification on the fly where relevant. |
| // TODO: Add support for scf.parallel and affine.for loops. |
| SmallVector<Value> peelLoop(RewriterBase &rewriter, Operation *op); |
| |
| /// Peel `loops` and apply affine_min/max bounds simplification on the fly |
| /// where relevant. |
| void peelLoops(RewriterBase &rewriter, ArrayRef<scf::ForOp> loops); |
| |
| /// Pad the iterator dimensions `paddingDimensions` of all `opToPad` operands |
| /// to a static bounding box. The original `opToPad` is cloned and operates on |
| /// the padded tensors. |
| /// |
| /// * "options.padToMultipleOf" indicates that each padding dimension should be |
| /// padded to the specified multiple. |
| /// * Use "options.paddingValues" and "options.packPaddings" to set padding |
| /// value and nofold attribute of the created tensor::PadOps, respectively. |
| /// * The unpadded results (extracted slice of the cloned operation) are |
| /// returned via `replacements`. |
| /// * The tensor::PadOps are returned via `padOps`. |
| /// * "options.copyBackOp" specifies the op type for copying back the unpadded |
| /// result to the original destination tensor. |
| LogicalResult rewriteAsPaddedOp(RewriterBase &rewriter, LinalgOp opToPad, |
| const LinalgPaddingOptions &options, |
| LinalgOp &paddedOp, |
| SmallVector<Value> &replacements, |
| SmallVector<tensor::PadOp> &padOps); |
| |
| namespace detail { |
| |
| /// Helper struct to hold the results of building a packing loop nest. |
| struct PackingResult { |
| SmallVector<OpFoldResult> offsets, sizes, strides; |
| SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings; |
| GenericOp maybeTransposeOp; |
| tensor::PadOp hoistedPadOp; |
| }; |
| |
| /// Build the packing loop nest required to hoist `opToHoist` above |
| /// `outermostEnclosingForOp`. |
| /// The loop nest is built just before `outermostEnclosingForOp`. |
| FailureOr<PackingResult> |
| buildPackingLoopNest(RewriterBase &rewriter, tensor::PadOp opToHoist, |
| scf::ForOp outermostEnclosingForOp, |
| ArrayRef<int64_t> transposeVector); |
| |
| } // namespace detail |
| |
| /// Mechanically hoist padding operations on tensors by `numLoops` into a new, |
| /// generally larger tensor. This achieves packing of multiple padding ops into |
| /// a larger tensor. On success, `opToHoist` is replaced by the cloned version |
| /// in the packing loop, so the caller can continue reasoning about the padding |
| /// operation. If `transposeVector` is non-empty, hoisting also introduces a |
| /// GenericOp to transpose the padded tensor before it is inserted into the |
| /// packed tensor. A `transposeVector` can change the storage order of the |
| /// padded tensor but does not change the order of the pack or compute loops. |
| /// |
| /// TODO: In the future, we should consider rewriting as a tensor.pack after |
| /// hoisting since this abstraction is now available. |
| /// |
| /// Example in pseudo-mlir: |
| /// ======================= |
| /// |
| /// If hoistPaddingOnTensors is called with `numLoops` = 2 on the following IR. |
| /// ``` |
| /// scf.for (%i, %j, %k) |
| /// %st0 = tensor.extract_slice f(%i, %k) : ... to tensor<?x?xf32> |
| /// %0 = tensor.pad %st0 low[0, 0] high[...] { |
| /// ^bb0( ... ): |
| /// linalg.yield %pad |
| /// } : tensor<?x?xf32> to tensor<4x8xf32> |
| /// compute(%0) |
| /// ``` |
| /// |
| /// IR resembling the following is produced: |
| /// |
| /// ``` |
| /// scf.for (%i) { |
| /// %packed_init = tensor.empty range(%j) : tensor<?x4x8xf32> |
| /// %packed = scf.for (%k) iter_args(%p : %packed_init) { |
| /// %st0 = tensor.extract_slice f(%i, %k) : ... to tensor<?x?xf32> |
| /// %0 = tensor.pad %st0 low[0, 0] high[...] { |
| /// ^bb0( ... ): |
| /// linalg.yield %pad |
| /// } : tensor<?x?xf32> to tensor<4x8xf32> |
| /// %1 = tensor.insert_slice %0 ... |
| /// : tensor<4x8xf32> to tensor<?x4x8xf32> |
| /// scf.yield %1: tensor<?x4x8xf32> |
| /// } -> tensor<?x4x8xf32> |
| /// scf.for (%j, %k) { |
| /// %st0 = tensor.extract_slice %packed [%k, 0, 0][1, 4, 8][1, 1, 1] : |
| /// tensor<?x4x8xf32> to tensor<4x8xf32> |
| /// compute(%st0) |
| /// } |
| /// } |
| /// ``` |
| FailureOr<Value> |
| hoistPaddingOnTensors(RewriterBase &rewriter, tensor::PadOp opToHoist, |
| int64_t numLoops, ArrayRef<int64_t> transposeVector, |
| tensor::PadOp &hoistedOp, |
| SmallVectorImpl<GenericOp> &transposeOps); |
| /// Calls into `hoistPaddingOnTensors` with a local IRRewriter. |
| FailureOr<Value> |
| hoistPaddingOnTensors(tensor::PadOp opToHoist, int64_t numLoops, |
| ArrayRef<int64_t> transposeVector, |
| tensor::PadOp &hoistedOp, |
| SmallVectorImpl<GenericOp> &transposeOps); |
| |
| /// Apply padding and hoisting to `linalgOp` according to the configuration |
| /// specified in `options`. |
| FailureOr<LinalgOp> padAndHoistLinalgOp(RewriterBase &rewriter, |
| LinalgOp linalgOp, |
| const LinalgPaddingOptions &options); |
| |
| /// Split the given `op` into two parts along the given iteration space |
| /// `dimension` at the specified `splitPoint`, and return the two parts. |
| /// If the second part is statically known to be empty, do not create it |
| /// and return nullptr instead. An error is signaled by returning |
| /// a pair of nullptrs. |
| /// |
| /// For example, the following op: |
| /// |
| /// linalg.matmul ins(%0, %1 : tensor<128x32xf32>, tensor<32x64xf32>) |
| /// outs(%2 : tensor<128x64xf32>) |
| /// |
| /// split along the first dimension at position 42 will result in: |
| /// |
| /// %3 = tensor.extract_slice %0[0, 0][42, 32][1, 1] |
| /// %4 = tensor.extract_slice %2[0, 0][42, 64][1, 1] |
| /// %5 = linalg.matmul ins(%3, %1 : tensor<42x32xf32>, tensor<32x64xf32>) |
| /// outs(%4 : tensor<42x64xf32>) |
| /// %6 = tensor.insert_slice %5 into %2[0, 0][42, 64][1, 1] |
| /// |
| /// %7 = tensor.extract_slice %0[42, 0][86, 32][1, 1] |
| /// %8 = tensor.extract_slice %6[42, 0][86, 64][1, 1] |
| /// %9 = linalg.matmul ins(%7, %1 : tensor<86x32xf32>, tensor<32x64xf32>) |
| /// outs(%8 : tensor<86x64xf32>) |
| /// tensor.insert_slice %9 into %6[42, 0][86, 64][1, 1] |
| /// |
| /// Note that there is no simplification other than constant propagation applied |
| /// to slice extraction and insertion. |
| std::pair<TilingInterface, TilingInterface> splitOp(RewriterBase &rewriter, |
| TilingInterface op, |
| unsigned dimension, |
| OpFoldResult splitPoint); |
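| |
| // Example call performing the split shown above (a sketch; assumes `rewriter` |
| // and a `TilingInterface`-implementing `op`): |
| // |
| //   auto [firstPart, secondPart] = |
| //       splitOp(rewriter, op, /*dimension=*/0, |
| //               /*splitPoint=*/OpFoldResult(rewriter.getIndexAttr(42))); |
| //   if (!firstPart && !secondPart) |
| //     return failure(); // Error state: a pair of nullptrs. |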
| |
| /// Perform standalone tiling of a single LinalgOp by `tileSizes`, and permute |
| /// the loop nest according to `interchangeVector`. |
| /// The permutation is expressed as a list of integers that specify |
| /// the new ordering of the loop nest. The length of `interchangeVector` |
| /// must be equal to the length of `tileSizes`. |
| /// An empty vector is interpreted as the identity permutation and the |
| /// transformation returns early. |
| /// |
| /// Return a struct containing the tiled loops in the specified order |
| /// and the cloned op if successful, failure otherwise. |
| /// |
| /// E.g. the permutation `(i,j,k) -> (j,k,i)` is expressed by |
| /// `interchangeVector = [1,2,0]`. All values in `interchangeVector` must be |
| /// integers, in the range 0..`tileSizes.size()` without duplications |
| /// (i.e. `[1,1,2]` is an invalid permutation). |
| struct TiledLinalgOp { |
| LinalgOp op; |
| SmallVector<Operation *, 8> loops; |
| SmallVector<Value, 4> tensorResults; |
| }; |
| FailureOr<TiledLinalgOp> tileLinalgOp(RewriterBase &b, LinalgOp op, |
| const LinalgTilingOptions &options); |
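| |
| // Example driver (a sketch; the tile sizes are illustrative and `rewriter` is |
| // assumed to exist): |
| // |
| //   LinalgTilingOptions options; |
| //   options.setTileSizes({8, 16, 4}); |
| //   FailureOr<TiledLinalgOp> tiled = tileLinalgOp(rewriter, linalgOp, options); |
| //   if (failed(tiled)) |
| //     return failure(); |
| //   // On tensors, the results of the tiled computation replace the op. |
| //   if (!tiled->tensorResults.empty()) |
| //     rewriter.replaceOp(linalgOp, tiled->tensorResults); |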
| |
| /// Interchange the `iterator_types` and `indexing_maps` dimensions and adapt |
| /// the index accesses of `op`. This is an in-place transformation controlled |
| /// by `interchangeVector`. An empty vector is interpreted as the identity |
| /// permutation and the transformation returns early. |
| /// |
| /// E.g. the permutation `(i,j,k) -> (j,k,i)` is expressed with |
| /// `interchangeVector = [1,2,0]`. All values in `interchangeVector` must be |
| /// integers, in the range 0..`op.rank` without duplications |
| /// (i.e. `[1,1,2]` is an invalid permutation). |
| /// |
| /// Return failure if the permutation is not valid. |
| FailureOr<GenericOp> interchangeGenericOp(RewriterBase &rewriter, |
| GenericOp genericOp, |
| ArrayRef<unsigned> interchangeVector); |
| |
| /// Create a GenericOp from the given named operation `namedOp` and replace |
| /// namedOp. |
| /// Return failure if `namedOp` is a GenericOp or lacks a region builder. |
| FailureOr<GenericOp> generalizeNamedOp(RewriterBase &rewriter, |
| LinalgOp namedOp); |
| |
| /// Create a namedOp from the given GenericOp and replace the GenericOp. |
| /// Currently we can specialize only trivial linalg copy operations. |
| FailureOr<LinalgOp> specializeGenericOp(RewriterBase &rewriter, |
| GenericOp genericOp); |
| |
| /// Create a new buffer using the `allocationFn` provided. The size of this |
| /// buffer is the smallest constant bounding size along each dimension that |
| /// can be computed for the size of the result of `subView`. Returns the |
| /// allocated buffer as `fullLocalView` and the view that matches the size of |
| /// the result of the subview operation as `partialLocalView`. |
| struct PromotionInfo { |
| Value fullLocalView; |
| Value partialLocalView; |
| }; |
| FailureOr<PromotionInfo> |
| promoteSubviewAsNewBuffer(OpBuilder &b, Location loc, memref::SubViewOp subView, |
| const AllocBufferCallbackFn &allocationFn, |
| DataLayout &layout); |
| |
| /// Promote the `subViews` into a new buffer allocated at the insertion point |
| /// `b`. Promotion occurs in 3 steps: |
| /// 1. Create a new buffer for a full tile (i.e. not clipped at the |
| /// boundary). |
| /// 2. Take a full view on the buffer. |
| /// 3. Take a partial slice of the full view from step 2 and copy into it. |
| /// |
| /// Return the modified linalg op (the modification happens in place) as well |
| /// as all the copy ops created. |
| FailureOr<LinalgOp> promoteSubViews(OpBuilder &b, LinalgOp op, |
| const LinalgPromotionOptions &options); |
| |
| /// Allocate the subview in the GPU workgroup memory. |
| std::optional<Value> allocateWorkgroupMemory(OpBuilder &builder, |
| memref::SubViewOp subview, |
| ArrayRef<Value> sizeBounds, |
| DataLayout &); |
| |
| /// In case of GPU group memory there is no need to deallocate. |
| LogicalResult deallocateWorkgroupMemory(OpBuilder &, Value /*buffer*/); |
| |
| /// Create memref copy operations and add GPU barrier guards before and after |
| /// the copy operation to ensure data integrity. |
| LogicalResult copyToWorkgroupMemory(OpBuilder &b, Value src, Value dst); |
| |
| /// Allocate the subview in the GPU private memory. |
| std::optional<Value> allocateGPUPrivateMemory(OpBuilder &builder, |
| memref::SubViewOp subview, |
| ArrayRef<Value> sizeBounds, |
| DataLayout &); |
| |
| /// Normal copy between src and dst. |
| LogicalResult copyToGPUPrivateMemory(OpBuilder &b, Value src, Value dst); |
| |
| /// In case of GPU private memory there is no need to deallocate since the |
| /// memory is freed when going outside of the scope. |
| LogicalResult deallocateGPUPrivateMemory(OpBuilder &, Value /*buffer*/); |
| |
| /// Emit a suitable vector form for an operation. If provided, |
| /// `inputVectorSizes` are used to vectorize this operation. `inputVectorSizes` |
| /// must match the rank of the iteration space of the operation, and each size |
| /// must be smaller than or equal to its counterpart iteration space size, if |
| /// static. `inputVectorSizes` also allows the vectorization of operations with |
| /// dynamic shapes. |
| LogicalResult vectorize(RewriterBase &rewriter, Operation *op, |
| ArrayRef<int64_t> inputVectorSizes = {}, |
| ArrayRef<bool> inputScalableVecDims = {}, |
| bool vectorizeNDExtract = false, |
| bool flatten1DDepthwiseConv = false); |
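| |
| // Example with explicit vector sizes (a sketch; the sizes must match the |
| // iteration-space rank of `op`): |
| // |
| //   if (failed(vectorize(rewriter, op, |
| //                        /*inputVectorSizes=*/{8, 16}, |
| //                        /*inputScalableVecDims=*/{false, false}))) |
| //     return failure(); |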
| |
| /// Emit a suitable vector form for a Copy op with fully static shape. |
| LogicalResult vectorizeCopy(RewriterBase &builder, memref::CopyOp copyOp); |
| |
| /// Emit a loop nest of `scf.for` with the proper body for `linalgOp`. |
| FailureOr<LinalgLoops> linalgOpToLoops(RewriterBase &rewriter, |
| LinalgOp linalgOp); |
| |
| /// Emit a loop nest of `scf.parallel` with the proper body for `linalgOp`. |
| FailureOr<LinalgLoops> linalgOpToParallelLoops(RewriterBase &rewriter, |
| LinalgOp linalgOp); |
| |
| /// Emit a loop nest of `affine.for` with the proper body for `linalgOp`. |
| FailureOr<LinalgLoops> linalgOpToAffineLoops(RewriterBase &rewriter, |
| LinalgOp linalgOp); |
| |
| /// Creates a number of ranges equal to the number of non-zero entries in |
| /// `tileSizes`, one for each loop of the LinalgOp that is tiled. The |
| /// `tileSizes` argument has one entry per surrounding loop; by convention, a |
| /// zero entry means that the corresponding loop is not tiled. This convention |
| /// simplifies implementations by avoiding affine map manipulations. The |
| /// returned ranges correspond to the loop ranges, in the proper order, that |
| /// are tiled and for which new loops will be created. The function also |
| /// returns a map from loop indices of the LinalgOp to the corresponding |
| /// non-empty range indices of newly created loops. |
| using LoopIndexToRangeIndexMap = DenseMap<int, int>; |
| std::tuple<SmallVector<Range, 4>, LoopIndexToRangeIndexMap> |
| makeTiledLoopRanges(RewriterBase &b, Location loc, AffineMap map, |
| ArrayRef<OpFoldResult> allShapeSizes, |
| ArrayRef<OpFoldResult> allTileSizes); |
| |
| namespace detail { |
| template <typename T> |
| struct MultiSizeSpecificationBase { |
| /// Tile sizes. |
| T lowTileSize, highTileSize; |
| /// Number of tiles associated with each size. |
| T lowTripCount, highTripCount; |
| }; |
| } // namespace detail |
| |
| /// A description of a multi-size tiling comprising tile sizes and numbers of |
| /// tiles, expressed as Values which may or may not be constant. Multi-size |
| /// currently means two-size. |
| struct MultiSizeSpecification |
| : public detail::MultiSizeSpecificationBase<Value> {}; |
| struct StaticMultiSizeSpecification |
| : public detail::MultiSizeSpecificationBase<int64_t> {}; |
| |
| /// Emits the IR computing the multi-sized tiling specification with two tile |
| /// sizes not exceeding `targetSize`, each divisible by `sizeDivisor`, such |
| /// that there exist numbers of tiles with these sizes that fully cover the |
| /// given iteration space `dimension` of the structured `op`. |
| /// |
| /// The computation is as follows: |
| /// |
| /// b = originalTripCount floordiv sizeDivisor |
| /// t = (targetSize + sizeDivisor - 1) floordiv sizeDivisor |
| /// d = (b + t - 1) floordiv t |
| /// s = (b floordiv d) * sizeDivisor |
| /// v = b % d |
| /// u = d - v |
| /// |
| /// where the tile sizes are `s` and `s` + `sizeDivisor`, and the numbers of |
| /// the corresponding tiles are `u` and `v`, respectively. Alternatively, |
| /// |
| /// s * u + (s + sizeDivisor) * v == original size, |
| /// where s mod sizeDivisor = 0. |
| /// |
| /// Expects all values to be positive. In some cases with the target tile size |
| /// sufficiently close to the dimension shape and non-unit divisor, it is |
| /// impossible to compute such sizes. If `emitAssertion` is set, also emit the |
| /// assertion that size computation succeeded. |
| /// |
| /// Returns the specification consisting of both tile values and the number of |
| /// tiles of each size. |
| FailureOr<MultiSizeSpecification> |
| computeMultiTileSizes(OpBuilder &builder, LinalgOp op, unsigned dimension, |
| OpFoldResult targetSize, OpFoldResult divisor, |
| bool emitAssertions = true); |
| FailureOr<StaticMultiSizeSpecification> |
| computeStaticMultiTileSizes(LinalgOp op, unsigned dimension, int64_t targetSize, |
| int64_t divisor); |
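| |
| // Worked example for the computation above (illustrative numbers): with an |
| // original trip count of 15, targetSize = 8 and sizeDivisor = 1: |
| //   b = 15, t = 8, d = (15 + 8 - 1) floordiv 8 = 2, |
| //   s = (15 floordiv 2) * 1 = 7, v = 15 mod 2 = 1, u = 2 - 1 = 1, |
| // i.e. one tile of size s = 7 and one tile of size s + sizeDivisor = 8, which |
| // fully cover the dimension: 7 * 1 + 8 * 1 == 15. |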
| |
| /// Rewrite a TilingInterface `op` to a tiled `scf.forall`, applying |
| /// tiling by `numThreads`. |
| /// If non-empty, the `mapping` is added as an attribute to the |
| /// resulting `scf.forall`. |
| /// Zero tile sizes indicate that the dimension is not tiled, and can be |
| /// thought of as tiling by the full size of data. It is the user's |
| /// responsibility to ensure that `numThreads` is a valid tiling specification |
| /// (i.e. one that only tiles parallel dimensions, e.g. in the Linalg case). |
| struct ForallTilingResult { |
| Operation *tileOp; |
| Operation *tiledOp; |
| }; |
| FailureOr<ForallTilingResult> tileToForallOp(RewriterBase &builder, |
| TilingInterface op, |
| ArrayRef<OpFoldResult> numThreads, |
| std::optional<ArrayAttr> mapping); |
| |
| /// Same as `tileToForallOp`, but calculate the number of threads |
| /// required using the given tileSizes. |
| FailureOr<ForallTilingResult> |
| tileToForallOpUsingTileSizes(RewriterBase &builder, TilingInterface op, |
| ArrayRef<OpFoldResult> tileSizes, |
| std::optional<ArrayAttr> mapping); |
| |
| /// Transformation information returned after reduction tiling. |
| struct ForallReductionTilingResult { |
| /// The partial reduction tiled op generated. |
| Operation *parallelTiledOp; |
| /// The final reduction operation merging all the partial reductions. |
| Operation *mergeOp; |
| /// The op initializing the tensor used for partial reductions. |
| Operation *initialOp; |
| /// The `scf.forall` operation that iterates over the tiles. |
| scf::ForallOp loops; |
| }; |
| |
| /// Method to tile a reduction to parallel iterations computing partial |
| /// reductions. After the loop, all the partial reductions are merged into a |
| /// final reduction. For example, the following sequence |
| /// |
| /// ```mlir |
| /// %0 = linalg.generic %in ["parallel", "reduction"] |
| /// : tensor<7x9xf32> -> tensor<7xf32> |
| /// ``` |
| /// |
| /// is transformed into: |
| /// |
| /// ```mlir |
| /// %0 = linalg.fill ... : tensor<7x4xf32> |
| /// %1 = scf.forall (%iv) in (%c4) shared_outs(%arg0 = %0) |
| /// -> (tensor<7x4xf32>) { |
| /// %2 = tensor.extract_slice %arg0 : tensor<7x4xf32> to tensor<7xf32> |
| /// %3 = tensor.extract_slice %in : tensor<7x9xf32> to tensor<7x?xf32> |
| /// %4 = linalg.generic %2, %3 ["parallel", "reduction"] |
| /// : tensor<7x?xf32> -> tensor<7xf32> |
| /// %5 = tensor.insert_slice %4, %arg0[0, %iv] : tensor<7x4xf32> |
| /// } |
| /// %6 = linalg.generic %1 ["parallel", "reduction"] |
| /// : tensor<7x4xf32> -> tensor<7xf32> |
| /// ``` |
| FailureOr<ForallReductionTilingResult> |
| tileReductionUsingForall(RewriterBase &b, PartialReductionOpInterface op, |
| ArrayRef<OpFoldResult> numThreads, |
| ArrayRef<OpFoldResult> tileSizes = {}, |
| std::optional<ArrayAttr> mapping = std::nullopt); |
| |
| /// All indices returned by IndexOp should be invariant with respect to |
| /// tiling. Therefore, if an operation is tiled, we have to transform the |
| /// indices accordingly, i.e. offset them by the values of the corresponding |
| /// induction variables that are captured implicitly in the body of the op. |
| /// |
| /// Example. `linalg.generic` before tiling: |
| /// |
| /// #id_2d = (i, j) -> (i, j) |
| /// #pointwise_2d_trait = { |
| /// indexing_maps = [#id_2d, #id_2d], |
| /// iterator_types = ["parallel", "parallel"] |
| /// } |
| /// linalg.generic #pointwise_2d_trait %operand, %result { |
| /// ^bb0(%operand_in: f32, %result_in: f32): |
| /// %i = linalg.index 0 : index |
| /// %j = linalg.index 1 : index |
| /// <some operations that use %i, %j> |
| /// }: memref<50x100xf32>, memref<50x100xf32> |
| /// |
| /// After the tiling pass with tile sizes 10 and 25: |
| /// |
| /// #strided = (i, j)[s0, s1, s2] -> (i * s1 + s0 + j * s2) |
| /// |
| /// %c1 = arith.constant 1 : index |
| /// %c0 = arith.constant 0 : index |
| /// %c25 = arith.constant 25 : index |
| /// %c10 = arith.constant 10 : index |
| /// operand_dim_0 = dim %operand, 0 : memref<50x100xf32> |
| /// operand_dim_1 = dim %operand, 1 : memref<50x100xf32> |
| /// scf.for %k = %c0 to operand_dim_0 step %c10 { |
| /// scf.for %l = %c0 to operand_dim_1 step %c25 { |
| /// %4 = memref.subview %operand[%k, %l][%c10, %c25][%c1, %c1] |
| /// : memref<50x100xf32> to memref<?x?xf32, #strided> |
| /// %5 = memref.subview %result[%k, %l][%c10, %c25][%c1, %c1] |
| /// : memref<50x100xf32> to memref<?x?xf32, #strided> |
| /// linalg.generic pointwise_2d_trait %4, %5 { |
| /// ^bb0(%operand_in: f32, %result_in: f32): |
| /// %i = linalg.index 0 : index |
| /// %j = linalg.index 1 : index |
| /// // Indices `k` and `l` are implicitly captured in the body. |
| /// %transformed_i = arith.addi %i, %k : index // `i` is offset by %k |
| /// %transformed_j = arith.addi %j, %l : index // `j` is offset by %l |
| /// // Every use of %i, %j is replaced with %transformed_i, %transformed_j. |
| /// <some operations that use %transformed_i, %transformed_j> |
| /// }: memref<?x?xf32, #strided>, memref<?x?xf32, #strided> |
| /// } |
| /// } |
| /// |
| /// TODO: Investigate whether mixing implicit and explicit indices |
| /// does not lead to losing information. |
| void transformIndexOps(RewriterBase &b, LinalgOp op, |
| SmallVectorImpl<Value> &ivs, |
| const LoopIndexToRangeIndexMap &loopIndexToRangeIndex); |
| |
| /// Apply a transformation to split the single reduction dimension of a linalg |
| /// op into a parallel and a reduction dimension, and create a new |
| /// linalg.generic op doing the rest of the reduction. Return the new linalg op |
| /// with an extra parallel dimension, or failure if the transformation did not |
| /// happen. |
| /// |
| /// Example: |
| /// ``` |
| /// %r = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, |
| /// affine_map<(d0) -> ()>], |
| /// iterator_types = ["reduction"]} |
| /// ins(%in : tensor<32xf32>) |
| /// outs(%out : tensor<f32>) { |
| /// ^bb0(%arg1: f32, %arg2: f32): |
| /// %y = arith.addf %arg1, %arg2 : f32 |
| /// linalg.yield %y : f32 |
| /// } -> tensor<f32> |
| /// ``` |
| /// To: |
| /// ``` |
| /// %cst = arith.constant 0.000000e+00 : f32 |
| /// %0 = tensor.expand_shape %in [[0, 1]] : tensor<32xf32> into tensor<4x8xf32> |
| /// %1 = tensor.empty [4] : tensor<4xf32> |
| /// %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<4xf32>) -> tensor<4xf32> |
| /// %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, |
| /// affine_map<(d0, d1) -> (d0)>], |
| /// iterator_types = ["parallel", "reduction"]} |
| /// ins(%0 : tensor<4x8xf32>) outs(%2 : tensor<4xf32>) { |
| /// ^bb0(%arg3: f32, %arg4: f32): |
| /// %5 = arith.addf %arg3, %arg4 : f32 |
| /// linalg.yield %5 : f32 |
| /// } -> tensor<4xf32> |
| /// %r = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, |
| /// affine_map<(d0) -> ()>], |
| /// iterator_types = ["reduction"]} |
| /// ins(%3 : tensor<4xf32>) outs(%out : tensor<f32>) { |
| /// ^bb0(%arg3: f32, %arg4: f32): |
| /// %5 = arith.addf %arg3, %arg4 : f32 |
| /// linalg.yield %5 : f32 |
| /// } -> tensor<f32> |
| /// ``` |
| struct SplitReductionResult { |
| Operation *initOrAlloc; |
| FillOp fillOp; |
| LinalgOp splitLinalgOp; |
| LinalgOp resultCombiningLinalgOp; |
| }; |
| FailureOr<SplitReductionResult> |
| splitReduction(RewriterBase &b, LinalgOp op, |
| const ControlSplitReductionFn &controlSplitReductionFn, |
| bool useAlloc = false); |
| |
| /// Scaling-based implementation of the split reduction transformation. |
| /// Instead of introducing an ExpandShapeOp, this rewrites a reduction |
| /// dimension `k` into `k * scale + kk`. |
| /// |
| /// Example: |
| /// ``` |
| /// %0 = linalg.matmul ins(%A, %B: tensor<16x256xf32>, tensor<256x32xf32>) |
| /// outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32> |
| /// ``` |
| /// |
| /// Is transformed to: |
| /// |
| /// ``` |
| /// #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2 * 4 + d3)> |
| /// #map1 = affine_map<(d0, d1, d2, d3) -> (d2 * 4 + d3, d1)> |
| /// #map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)> |
| /// #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> |
| /// #map4 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> |
| /// #map5 = affine_map<(d0, d1, d2) -> (d0, d1)> |
| /// %0 = tensor.empty [16, 32, 64] : tensor<16x32x64xf32> |
| /// %cst = arith.constant 0.000000e+00 : f32 |
| /// %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<16x32x64xf32>) -> |
| /// tensor<16x32x64xf32> |
| /// %2 = tensor.empty [64, 4] : tensor<64x4xi1> |
| /// |
| /// %3 = linalg.generic {indexing_maps = [#map0, #map1, #map2, #map3], |
| /// iterator_types = ["parallel", "parallel", "parallel", "reduction"]} |
| /// ins(%A, %B, %2 : tensor<16x256xf32>, tensor<256x32xf32>, |
| /// tensor<64x4xi1>) |
| /// outs(%1 : tensor<16x32x64xf32>) { |
| /// ^bb0(%arg3: f32, %arg4: f32, %arg5: i1, %arg6: f32): |
| /// %5 = arith.mulf %arg3, %arg4 : f32 |
| /// %6 = arith.addf %arg6, %5 : f32 |
| /// linalg.yield %6 : f32 |
| /// } -> tensor<16x32x64xf32> |
| /// |
| /// %4 = linalg.generic {indexing_maps = [#map4, #map5], |
| /// iterator_types = ["parallel", "parallel", "reduction"]} |
| /// ins(%3 : tensor<16x32x64xf32>) |
| /// outs(%C : tensor<16x32xf32>) { |
| /// ^bb0(%arg3: f32, %arg4: f32): |
| /// %5 = arith.addf %arg3, %arg4 : f32 |
| /// linalg.yield %5 : f32 |
| /// } -> tensor<16x32xf32> |
| /// |
| /// return %4 : tensor<16x32xf32> |
| /// ``` |
| FailureOr<SplitReductionResult> |
| splitReductionByScaling(RewriterBase &b, LinalgOp op, |
| const ControlSplitReductionFn &controlSplitReductionFn, |
| bool useAlloc = false); |
| |
| /// Return `true` if a given sequence of dimensions is contiguous in the |
| /// range of the specified indexing map. |
| bool isDimSequencePreserved(AffineMap map, ReassociationIndicesRef dimSequence); |
| /// Return `true` if all sequences of dimensions specified in `dimSequences` are |
| /// contiguous in all the ranges of the `maps`. |
| bool areDimSequencesPreserved(ArrayRef<AffineMap> maps, |
| ArrayRef<ReassociationIndices> dimSequences); |
| |
| struct CollapseResult { |
| SmallVector<Value> results; |
| LinalgOp collapsedOp; |
| }; |
| |
| /// Collapses dimensions of linalg.generic/linalg.copy operation. A precondition |
| /// to calling this method is that for each list in `foldedIterationDims`, the |
| /// sequence of dimensions is contiguous in domains of all `indexing_maps` of |
| /// the `linalgOp`. This can be checked using the `areDimSequencesPreserved` |
| /// method. |
| /// When valid, the method also collapses the operands of the op. Returns |
| /// replacement values of the results of the original `linalgOp` by inserting |
| /// reshapes to get back values of compatible types. |
| FailureOr<CollapseResult> |
| collapseOpIterationDims(LinalgOp op, |
| ArrayRef<ReassociationIndices> foldedIterationDims, |
| RewriterBase &rewriter); |
| |
| struct LowerPackResult { |
| tensor::PadOp padOp; |
| tensor::ExpandShapeOp expandShapeOp; |
| linalg::TransposeOp transposeOp; |
| }; |
| |
| /// Rewrite pack as pad + reshape + transpose. |
| FailureOr<LowerPackResult> lowerPack(RewriterBase &rewriter, |
| tensor::PackOp packOp); |
| |
| struct LowerUnPackOpResult { |
| tensor::EmptyOp emptyOp; |
| linalg::TransposeOp transposeOp; |
| tensor::CollapseShapeOp collapseShapeOp; |
| tensor::ExtractSliceOp extractSliceOp; |
| }; |
| |
| /// Rewrite unpack as empty + transpose + reshape + extract_slice. |
| FailureOr<LowerUnPackOpResult> lowerUnPack(RewriterBase &rewriter, |
| tensor::UnPackOp unPackOp); |
| |
| /// Struct to hold the result of a `pack` call. |
| struct PackResult { |
| SmallVector<tensor::PackOp> packOps; |
| linalg::LinalgOp packedLinalgOp; |
| SmallVector<tensor::UnPackOp> unPackOps; |
| }; |
| /// Implement packing of a single LinalgOp by `packedSizes`. |
| /// There must be one packedSizes entry per `linalgOp` iterator. |
| /// Return the packed Linalg op on success, failure otherwise. |
| FailureOr<PackResult> pack(RewriterBase &rewriter, linalg::LinalgOp linalgOp, |
| ArrayRef<OpFoldResult> packedSizes); |
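| |
| // Example: pack a matmul by (m, n, k) tile sizes (8, 16, 32) (a sketch; |
| // `getAsIndexOpFoldResult` is the helper from StaticValueUtils.h, included |
| // above, that converts static sizes to `OpFoldResult`s): |
| // |
| //   SmallVector<OpFoldResult> packedSizes = getAsIndexOpFoldResult( |
| //       rewriter.getContext(), ArrayRef<int64_t>{8, 16, 32}); |
| //   FailureOr<PackResult> packed = pack(rewriter, linalgOp, packedSizes); |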
| |
| /// Struct to hold the result of a `packTranspose` call. |
| struct PackTransposeResult { |
| tensor::PackOp transposedPackOp; |
| linalg::LinalgOp transposedLinalgOp; |
| tensor::UnPackOp transposedUnPackOp; |
| }; |
| /// Transpose a single PackOp -> LinalgOp -> UnPackOp chain and return the |
| /// transposed PackOp -> LinalgOp -> UnPackOp chain after replacements. |
| /// Return failure if any of the following holds: |
| /// 1. the `packOp` does not have the `linalgOp` as its unique use. |
| /// 2. the `maybeUnPackOp`, if specified, is not a consumer of the result tied |
| /// to the unique `packOp` use. |
| /// 3. `outerPerm` (resp. `innerPerm`) is neither a valid permutation of |
| /// `packOp.getOuterDimsPerm` (resp. `packOp.getInnerDimsPerm`) nor empty. |
| FailureOr<PackTransposeResult> |
| packTranspose(RewriterBase &rewriter, tensor::PackOp packOp, |
| linalg::LinalgOp linalgOp, tensor::UnPackOp maybeUnPackOp, |
| ArrayRef<int64_t> outerPerm, ArrayRef<int64_t> innerPerm); |
| |
| /// Pack a LinalgOp by greedily inferring matmul dimensions (m, n, k) where m |
| /// and n are proper parallel dimensions and k is a proper reduction |
| /// dimension. Packing occurs by rewriting the op as a linalg.generic and |
| /// calling linalg::pack by `mnkPackedSizes`. The order of the packed |
| /// dimensions is customizable: the `mnkOrder` is a permutation of {0, 1, 2} |
| /// to reorder {m, n, k} into one of the 6 possible permutations. The outer |
| /// dimensions of the operands are not permuted at this time, this is left for |
| /// future work. |
| FailureOr<PackResult> |
| packMatmulGreedily(RewriterBase &rewriter, LinalgOp linalgOp, |
| ArrayRef<OpFoldResult> mnkPackedSizes, |
| ArrayRef<int64_t> mnkPaddedSizesNextMultipleOf, |
| ArrayRef<int64_t> mnkOrder); |
| |
| /// Rewrite tensor.from_elements to linalg.generic. |
| FailureOr<Operation *> |
| rewriteInDestinationPassingStyle(RewriterBase &rewriter, |
| tensor::FromElementsOp fromElementsOp); |
| |
| /// Rewrite tensor.generate to linalg.generic. |
| FailureOr<Operation *> |
| rewriteInDestinationPassingStyle(RewriterBase &rewriter, |
| tensor::GenerateOp generateOp); |
| |
| /// Rewrite tensor.pad to linalg.generic + tensor.insert_slice. |
| FailureOr<Operation *> rewriteInDestinationPassingStyle(RewriterBase &rewriter, |
| tensor::PadOp padOp); |
| |
| /// Convert linalg.conv_2d_nhwc_hwcf into linalg.generic (for img2col packing) |
| /// and linalg.matmul. |
| /// |
| /// A convolution operation can be written as a matrix-matrix multiplication by |
| /// unfolding the cross-correlation between input and filter and explicitly |
| /// copying the overlapping sliding-window inputs. |
| /// |
| /// Consider 2D input X with single channel input and output and 2x2 filter W: |
| /// [x(0, 0) , x(0, 1) , ..., x(0, n) ] |
| /// [x(1, 0) , x(1, 1) , ..., x(1, n) ] |
| /// [. , . ,. , . ] [w(0, 0), w(0, 1)] |
| /// [. , . , . , . ] (conv) [w(1, 0), w(1, 1)] |
| /// [. , . , ., . ] |
| /// [x(n-1, 0), x(n-1, 1), ..., x(n-1, n-1)] |
| /// |
| /// The packed input data (img2col) is a matrix with |rows| = output spatial |
| /// size, |columns| = filter spatial size. To compute the output Y(i, j) we need |
| /// to calculate the dot product between the filter window at input X(x, y) and |
| /// the filter, which looks like the following, where the l.h.s. is the img2col |
| /// matrix and the r.h.s. is the flattened filter: |
| /// |
| /// [x(0,0), x(0,1), x(1,0), x(1,1)] |
| /// [x(0,1), x(0,2), x(1,1), x(1,2)] (matmul) [w(0,0), w(0,1), w(1,0), w(1,1)] |
| /// [x(0,2), x(0,3), x(1,2), x(1,3)] |
| /// [ . , . , . , . ] |
| /// |
| /// In general for 2D case with (N, H, W, C) input and (Kh, Kw, C, D) filter |
| /// and output (N, Ho, Wo, D) the convolution is the following matrix-matrix |
| /// multiplication (Ho x Wo, Kh x Kw x C) * (Kh x Kw x C, D) for each input in |
| /// the N inputs. For the case where N > 1 it is a batched matrix-matrix |
| /// multiplication. |
| /// |
| /// On success, return both the operation that produces the img2col tensor and |
| /// the final operation of the sequence that replaces the original convolution. |
| FailureOr<std::pair<Operation *, Operation *>> |
| rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcHwcfOp convOp); |
| |
| /// Same as the above but for Fhwc channel orderings in the filter. In this case |
| /// the matrix multiplication is actually a row-wise dot-product rather than a |
| /// row-column dot-product. This is to avoid transposing the filter matrix which |
| /// would be required for a regular matrix multiplication to produce the correct |
| /// output dimensions. |
| FailureOr<std::pair<Operation *, Operation *>> |
| rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp convOp); |
| |
| /// Similar to rewriteInIm2Col with linalg::Conv2DNhwcHwcfOp, except that there |
| /// is no reduction among the input channels, so each convolution can be a |
| /// matrix-vector product; by transposing both the input and the filter so that |
| /// the channels are outermost, the computation becomes a batched matrix-vector |
| /// product. |
| FailureOr<std::pair<Operation *, Operation *>> |
| rewriteInIm2Col(RewriterBase &rewriter, |
| linalg::DepthwiseConv2DNhwcHwcOp convOp); |
| |
| /// Similar to rewriteInIm2Col with linalg::Conv2DNhwcHwcfOp, except that, |
| /// because the channels are to the left of the image shape dimensions, the |
| /// position of the contraction dimension in the resulting matmul is reversed. |
| /// This swaps the LHS and RHS of the matmul when compared with nhwc |
| /// (i.e. (D, C x Kh x Kw) * (C x Kh x Kw, Ho x Wo)). |
| FailureOr<std::pair<Operation *, Operation *>> |
| rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNchwFchwOp convOp); |
| |
| /// Convert linalg.conv_2d_nhwc_fhwc(_q) to linalg.conv_2d_nhwc_hwcf(_q) by |
| /// materializing a transpose. |
| FailureOr<Operation *> transposeConv2D(RewriterBase &rewriter, |
| linalg::Conv2DNhwcFhwcOp op); |
| FailureOr<Operation *> transposeConv2D(RewriterBase &rewriter, |
| linalg::Conv2DNhwcFhwcQOp op); |
| |
| /// Convert Linalg matmul ops to transposed variants. |
| FailureOr<Operation *> transposeMatmul(RewriterBase &rewriter, |
| linalg::MatmulOp op, |
| bool transposeLHS = true); |
| FailureOr<Operation *> transposeBatchMatmul(RewriterBase &rewriter, |
| linalg::BatchMatmulOp op, |
| bool transposeLHS = true); |
| |
| //===----------------------------------------------------------------------===// |
| // Rewrite patterns wrapping transformations. |
| // TODO: every single such pattern should be a close to noop wrapper around a |
| // functional-style API call. |
| //===----------------------------------------------------------------------===// |
| |
| /// Rewrites 2-D convolution ops with size-1 window dimensions into 1-D |
| /// convolution ops. |
| template <typename Conv2DOp, typename Conv1DOp> |
| struct DownscaleSizeOneWindowed2DConvolution final |
| : public OpRewritePattern<Conv2DOp> { |
| using OpRewritePattern<Conv2DOp>::OpRewritePattern; |
| |
| FailureOr<Conv1DOp> returningMatchAndRewrite(Conv2DOp convOp, |
| PatternRewriter &rewriter) const; |
| |
| LogicalResult matchAndRewrite(Conv2DOp convOp, |
| PatternRewriter &rewriter) const override { |
| return returningMatchAndRewrite(convOp, rewriter); |
| } |
| }; |
| |
| extern template struct DownscaleSizeOneWindowed2DConvolution<Conv2DNhwcHwcfOp, |
| Conv1DNwcWcfOp>; |
| extern template struct DownscaleSizeOneWindowed2DConvolution<Conv2DNchwFchwOp, |
| Conv1DNcwFcwOp>; |
| |
| /// Rewrites 2-D depthwise convolution ops with size-1 (w, kw) or (h, kh) |
| /// dimensions into 1-D depthwise convolution ops. |
| struct DownscaleDepthwiseConv2DNhwcHwcOp final |
| : public OpRewritePattern<DepthwiseConv2DNhwcHwcOp> { |
| DownscaleDepthwiseConv2DNhwcHwcOp(MLIRContext *context, |
| PatternBenefit benefit = 1) |
| : OpRewritePattern<DepthwiseConv2DNhwcHwcOp>(context, benefit) {} |
| |
| FailureOr<DepthwiseConv1DNwcWcOp> |
| returningMatchAndRewrite(DepthwiseConv2DNhwcHwcOp convOp, |
| PatternRewriter &rewriter) const; |
| |
| LogicalResult matchAndRewrite(DepthwiseConv2DNhwcHwcOp convOp, |
| PatternRewriter &rewriter) const override { |
| return returningMatchAndRewrite(convOp, rewriter); |
| } |
| }; |
| |
| /// Rewrites a plain 2-D convolution op into a 1-D convolution op (analogous
| /// to the size-1 window downscaling patterns above).
| struct DownscaleConv2DOp final : public OpRewritePattern<Conv2DOp> {
| DownscaleConv2DOp(MLIRContext *context, PatternBenefit benefit = 1) |
| : OpRewritePattern<Conv2DOp>(context, benefit) {} |
| |
| FailureOr<Conv1DOp> returningMatchAndRewrite(Conv2DOp convOp, |
| PatternRewriter &rewriter) const; |
| |
| LogicalResult matchAndRewrite(Conv2DOp convOp, |
| PatternRewriter &rewriter) const override { |
| return returningMatchAndRewrite(convOp, rewriter); |
| } |
| }; |
| |
| /// |
| /// Linalg generalization pattern. |
| /// |
| /// Apply the generalization transformation as a pattern.
| /// See `generalizeNamedOp` for more details.
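| ///
| /// For example (an illustrative sketch):
| /// ```
| ///   %0 = linalg.matmul ins(%a, %b) outs(%c)
| /// ```
| /// is replaced by an equivalent linalg.generic with explicit indexing maps,
| /// iterator types ["parallel", "parallel", "reduction"] and an
| /// arith.mulf/arith.addf payload.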
| // |
| // TODO: Automatic default pattern class that just unwraps a function |
| // returning FailureOr<GenericOp>. |
| struct LinalgGeneralizationPattern |
| : public OpInterfaceRewritePattern<LinalgOp> { |
| using OpInterfaceRewritePattern<LinalgOp>::OpInterfaceRewritePattern; |
| |
| /// `matchAndRewrite` implementation that returns the significant |
| /// transformed pieces of IR. |
| FailureOr<GenericOp> |
| returningMatchAndRewrite(LinalgOp op, PatternRewriter &rewriter) const { |
| return generalizeNamedOp(rewriter, op); |
| } |
| |
| LogicalResult matchAndRewrite(LinalgOp op, |
| PatternRewriter &rewriter) const override { |
| return returningMatchAndRewrite(op, rewriter); |
| } |
| }; |
| |
| /// Vectorization pattern for memref::CopyOp. |
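| /// For example (an illustrative sketch; `%c0` and `%pad` are assumed
| /// constants):
| /// ```
| ///   memref.copy %src, %dst : memref<4xf32> to memref<4xf32>
| /// ```
| /// is rewritten to a full read/write transfer pair along the lines of
| /// ```
| ///   %v = vector.transfer_read %src[%c0], %pad
| ///        : memref<4xf32>, vector<4xf32>
| ///   vector.transfer_write %v, %dst[%c0] : vector<4xf32>, memref<4xf32>
| /// ```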
| struct CopyVectorizationPattern : public OpRewritePattern<memref::CopyOp> { |
| using OpRewritePattern<memref::CopyOp>::OpRewritePattern; |
| |
| LogicalResult matchAndRewrite(memref::CopyOp copyOp, |
| PatternRewriter &rewriter) const override; |
| }; |
| |
| using OptimizeCopyFn = |
| std::function<LogicalResult(RewriterBase &, tensor::PadOp, Value)>; |
| |
| /// Rewrite a tensor::PadOp into a sequence of EmptyOp, FillOp and
| /// InsertSliceOp. For now, only constant padding values are supported.
| /// `OptimizeCopyFn` can be used to customize the optimization of the copy
| /// step.
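| /// For example (an illustrative sketch; sizes and the padding region body
| /// elided, `%cst` is the yielded constant padding value):
| /// ```
| ///   %0 = tensor.pad low[%l] high[%h] %t ...
| /// ```
| /// becomes, roughly:
| /// ```
| ///   %empty = tensor.empty(...)
| ///   %filled = linalg.fill ins(%cst) outs(%empty)
| ///   %0 = tensor.insert_slice %t into %filled[%l] [...] [1]
| /// ```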
| struct GeneralizePadOpPattern : public OpRewritePattern<tensor::PadOp> { |
| GeneralizePadOpPattern(MLIRContext *context, |
| OptimizeCopyFn optimizeCopyFn = nullptr, |
| PatternBenefit benefit = 1) |
| : OpRewritePattern<tensor::PadOp>(context, benefit), |
| optimizeCopyFn(std::move(optimizeCopyFn)) {} |
| LogicalResult matchAndRewrite(tensor::PadOp padOp, |
| PatternRewriter &rewriter) const override; |
| |
| protected: |
| OptimizeCopyFn optimizeCopyFn; |
| Value createFillOrGenerateOp(RewriterBase &rewriter, tensor::PadOp padOp, |
| Value dest, |
| const SmallVector<Value> &dynSizes) const; |
| }; |
| |
| /// Rewrites a tensor::PackOp into a sequence of tensor.pad + linalg.transpose
| /// + tensor.insert_slice ops, applicable when all outer dimensions of the
| /// tensor::PackOp are 1.
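| /// For example (an illustrative sketch; `%cst` is an assumed f32 padding
| /// value):
| /// ```
| ///   %0 = tensor.pack %src padding_value(%cst : f32)
| ///       inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %dest
| ///       : tensor<5x1xf32> -> tensor<1x1x8x2xf32>
| /// ```
| /// is rewritten to a tensor.pad of %src up to tensor<8x2xf32>, a
| /// linalg.transpose (here the identity permutation) and a
| /// tensor.insert_slice of the result into %dest.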
| struct GeneralizeOuterUnitDimsPackOpPattern |
| : public OpRewritePattern<tensor::PackOp> { |
| using OpRewritePattern<tensor::PackOp>::OpRewritePattern; |
| LogicalResult matchAndRewrite(tensor::PackOp packOp, |
| PatternRewriter &rewriter) const override; |
| }; |
| |
| /// Rewrites a tensor::UnPackOp into a sequence of rank-reduced
| /// tensor.extract_slice + linalg.transpose + tensor.insert_slice ops,
| /// applicable when all outer dimensions of the tensor::UnPackOp are 1.
| struct GeneralizeOuterUnitDimsUnPackOpPattern |
| : public OpRewritePattern<tensor::UnPackOp> { |
| using OpRewritePattern<tensor::UnPackOp>::OpRewritePattern; |
| LogicalResult matchAndRewrite(tensor::UnPackOp unpackOp, |
| PatternRewriter &rewriter) const override; |
| }; |
| |
| /// Match and rewrite for the pattern: |
| /// ``` |
| /// %alloc = ... |
| /// [optional] %view = memref.view %alloc ... |
| /// %subView = subview %allocOrView ... |
| /// [optional] linalg.fill(%allocOrView, %cst) ... |
| /// ... |
| /// memref.copy(%in, %subView) ... |
| /// vector.transfer_read %allocOrView[...], %cst ... |
| /// ``` |
| /// into |
| /// ``` |
| /// [unchanged] %alloc = ... |
| /// [unchanged] [optional] %view = memref.view %alloc ... |
| /// [unchanged] %subView = subview %allocOrView ...
| /// ... |
| /// vector.transfer_read %in[...], %cst ... |
| /// ``` |
| /// This applies only when there is no interleaved use between memref.copy and
| /// transfer_read, and no interleaved use between linalg.fill and memref.copy
| /// (if linalg.fill is specified).
| /// This is a custom rewrite to forward partial reads (with optional fills) to |
| /// vector.transfer_read. |
| struct LinalgCopyVTRForwardingPattern |
| : public OpRewritePattern<vector::TransferReadOp> { |
| using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern; |
| |
| LogicalResult matchAndRewrite(vector::TransferReadOp xferOp, |
| PatternRewriter &rewriter) const override; |
| }; |
| |
| /// Match and rewrite for the pattern: |
| /// ``` |
| /// %alloc = ... |
| /// [optional] %view = memref.view %alloc ... |
| /// %subView = subview %allocOrView... |
| /// ... |
| /// vector.transfer_write %..., %allocOrView[...] |
| /// memref.copy(%subView, %out) |
| /// ``` |
| /// into |
| /// ``` |
| /// [unchanged] %alloc = ... |
| /// [unchanged] [optional] %view = memref.view %alloc ... |
| /// [unchanged] %subView = subview %allocOrView... |
| /// ... |
| /// vector.transfer_write %..., %out[...] |
| /// ``` |
| /// This applies only when there is no interleaved use between transfer_write
| /// and memref.copy. It is a custom rewrite to forward partial writes to
| /// vector.transfer_write.
| struct LinalgCopyVTWForwardingPattern |
| : public OpRewritePattern<vector::TransferWriteOp> { |
| using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern; |
| |
| LogicalResult matchAndRewrite(vector::TransferWriteOp xferOp, |
| PatternRewriter &rewriter) const override; |
| }; |
| |
| /// Rewrite extract_slice(tensor.pad(x)) into tensor.pad(extract_slice(x)). |
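| /// For example (an illustrative sketch; offsets, sizes and the pad body are
| /// elided):
| /// ```
| ///   %p = tensor.pad low[%l] high[%h] %t ...
| ///   %s = tensor.extract_slice %p[%o] [%sz] [1]
| /// ```
| /// becomes, roughly:
| /// ```
| ///   %ts = tensor.extract_slice %t[...] [...] [1]
| ///   %s = tensor.pad low[...] high[...] %ts ...
| /// ```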
| struct ExtractSliceOfPadTensorSwapPattern |
| : public OpRewritePattern<tensor::ExtractSliceOp> { |
| /// A function to control pattern application and rewrite logic. |
| /// |
| /// The function will be given the slice op and should return: |
| /// - std::nullopt: to fail the match and not apply the pattern; |
| /// - true: to apply the pattern with zero slice guard; |
| /// - false: to apply the pattern without zero slice guard. |
| /// |
| /// See the documentation for tensor::bubbleUpPadSlice regarding zero slice |
| /// guard. |
| using ControlFn = std::function<std::optional<bool>(tensor::ExtractSliceOp)>; |
| |
| ExtractSliceOfPadTensorSwapPattern(MLIRContext *context, |
| ControlFn controlFn = nullptr, |
| PatternBenefit benefit = 1) |
| : OpRewritePattern(context, benefit), controlFn(std::move(controlFn)) {} |
| |
| LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp, |
| PatternRewriter &rewriter) const override; |
| |
| private: |
| ControlFn controlFn; |
| }; |
| |
| //===----------------------------------------------------------------------===// |
| // Populate functions. |
| //===----------------------------------------------------------------------===// |
| |
| /// Canonicalization patterns relevant to apply after tiling patterns. These |
| /// are applied automatically by the tiling pass but need to be applied |
| /// manually when tiling is called programmatically. |
| RewritePatternSet getLinalgTilingCanonicalizationPatterns(MLIRContext *ctx); |
| void populateLinalgTilingCanonicalizationPatterns(RewritePatternSet &patterns); |
| |
| /// Linalg generalization patterns |
| |
| /// Populates `patterns` with patterns to convert spec-generated named ops to |
| /// linalg.generic ops. |
| void populateLinalgNamedOpsGeneralizationPatterns(RewritePatternSet &patterns); |
| |
| /// Linalg convolution decomposition patterns
| |
| /// Populates patterns to decompose high-D convolution ops into low-D ones.
| /// This is a step in the progressive lowering of convolution ops; afterwards,
| /// the low-D convolution ops can be vectorized.
| void populateDecomposeConvolutionPatterns(RewritePatternSet &patterns, |
| PatternBenefit benefit = 1); |
| |
| /// Populates patterns to transform linalg.conv_2d_xxx operations into |
| /// linalg.generic (for img2col packing) and linalg.matmul. |
| /// \see rewriteInIm2Col for more details. |
| void populateConvertConv2DToImg2ColPatterns(RewritePatternSet &patterns); |
| |
| /// Populates `patterns` with patterns that vectorize tensor.pad. |
| /// These patterns are meant to apply in a complementary fashion. Benefits |
| /// are used to encode a certain ordering of pattern application. To avoid |
| /// scattering magic constants throughout the code base, the patterns must be |
| /// added with this function. `baseBenefit` can be used to offset the benefit |
| /// of all tensor::PadOp vectorization patterns by a certain value. |
| void populatePadOpVectorizationPatterns(RewritePatternSet &patterns, |
| PatternBenefit baseBenefit = 1); |
| |
| /// Populate patterns for splitting a `LinalgOp` with multiple statements
| /// within its payload into multiple `GenericOp`s that each have a single
| /// statement.
| /// The option `removeDeadArgsAndResults` adds patterns to remove dead
| /// arguments and results from the generated decomposed ops. This defaults to
| /// `true` since the core decomposition patterns rely on these cleanup
| /// patterns. It is set to `false` only for testing purposes.
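| /// For example (an illustrative sketch): a linalg.generic whose region
| /// computes
| /// ```
| ///   %t = arith.addf %a, %b
| ///   %r = arith.mulf %t, %c
| /// ```
| /// is split into one linalg.generic computing the addf and a second one
| /// computing the mulf, with the intermediate %t threaded through as an
| /// extra result/operand pair.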
| void populateDecomposeLinalgOpsPattern(RewritePatternSet &patterns, |
| bool removeDeadArgsAndResults = true); |
| |
| /// Populate patterns that convert non-destination-style ops to destination |
| /// style ops. |
| void populateConvertToDestinationStylePatterns(RewritePatternSet &patterns); |
| |
| /// Populate patterns for vectorizing low-D convolution ops. This is a step in
| /// the progressive lowering of convolution ops; it assumes high-D convolution
| /// ops were decomposed previously.
| void populateConvolutionVectorizationPatterns(RewritePatternSet &patterns, |
| PatternBenefit benefit = 1); |
| |
| /// Populate patterns that convert `ElementwiseMappable` ops to linalg |
| /// parallel loops. |
| void populateElementwiseToLinalgConversionPatterns(RewritePatternSet &patterns); |
| |
| /// Populate patterns that are only useful in the context of sparse tensors. |
| void populateSparseTensorRewriting(RewritePatternSet &patterns); |
| |
| /// Function type which is used to control when to stop fusion. It is expected |
| /// that OpOperand is not modified in the callback. The OpOperand is not marked |
| /// as const to allow callers to use non-const methods. |
| using ControlFusionFn = std::function<bool(OpOperand *fusedOperand)>; |
| |
| /// Patterns for fusing linalg operations on tensors.
| |
| /// Pattern to fuse `linalg.generic` -> `linalg.generic` operations |
| /// when both operations are fusable elementwise operations. |
| void populateElementwiseOpsFusionPatterns( |
| RewritePatternSet &patterns, |
| const ControlFusionFn &controlElementwiseOpFusion); |
| |
| /// Function type which is used to control propagation of tensor.pack/unpack |
| /// ops. |
| using ControlPropagationFn = std::function<bool(Operation *op)>; |
| |
| /// Patterns to bubble up or down data layout ops across other operations. |
| void populateDataLayoutPropagationPatterns( |
| RewritePatternSet &patterns, |
| const ControlPropagationFn &controlPackUnPackPropagation); |
| |
| /// Pattern to remove dead operands and results of `linalg.generic` operations. |
| /// This is effectively DCE for a linalg op. |
| void populateEraseUnusedOperandsAndResultsPatterns(RewritePatternSet &patterns); |
| |
| /// Patterns to promote inputs to outputs and remove unused inputs of |
| /// `linalg.generic` ops. |
| void populateEraseUnnecessaryInputsPatterns(RewritePatternSet &patterns); |
| |
| /// Function type to control generic op dimension collapsing. It is expected |
| /// to return an array of `ReassociationIndices` representing dimensions that |
| /// should be merged. |
| using GetCollapsableDimensionsFn = |
| std::function<SmallVector<ReassociationIndices>(linalg::LinalgOp)>; |
| |
| /// Pattern to collapse dimensions in a linalg.generic op. This will collapse |
| /// tensor operands when needed and expand back the result tensors. |
| void populateCollapseDimensions( |
| RewritePatternSet &patterns, |
| const GetCollapsableDimensionsFn &controlCollapseDimensions); |
| |
| /// Patterns to fold an expanding (collapsing) reshape operation, i.e.
| /// tensor.expand_shape (tensor.collapse_shape), with its producer (consumer)
| /// generic operation by expanding the dimensionality of the loop in the
| /// generic op.
| void populateFoldReshapeOpsByExpansionPatterns( |
| RewritePatternSet &patterns, const ControlFusionFn &controlFoldingReshapes); |
| |
| /// Patterns to fold a tensor.expand_shape operation with its producer generic
| /// operation by collapsing the dimensions of the generic op.
| void populateFoldReshapeOpsByCollapsingPatterns( |
| RewritePatternSet &patterns, const ControlFusionFn &controlFoldingReshapes); |
| |
| /// Patterns to constant fold Linalg operations. |
| void populateConstantFoldLinalgOperations(RewritePatternSet &patterns, |
| const ControlFusionFn &controlFn); |
| |
| /// Pattern to fuse a `tensor.pad` operation with the producer of its source, |
| /// if the producer is a `linalg` operation with all parallel iterator types. |
| void populateFuseTensorPadWithProducerLinalgOpPatterns( |
| RewritePatternSet &patterns); |
| |
| /// Patterns to convert from one named op to another. These can be seen as
| /// canonicalizations of one named op into another.
| void populateLinalgNamedOpConversionPatterns(RewritePatternSet &patterns); |
| |
| /// Patterns to fold unit-extent dimensions in operands/results of linalg ops on |
| /// tensors via reassociative reshape ops. |
| void populateFoldUnitExtentDimsPatterns(RewritePatternSet &patterns, |
| ControlDropUnitDims &options); |
| |
| /// A pattern that converts init operands to input operands. |
| void populateMoveInitOperandsToInputPattern(RewritePatternSet &patterns); |
| |
| /// Patterns that are used to inline constant operands into linalg generic ops. |
| void populateInlineConstantOperandsPatterns(RewritePatternSet &patterns); |
| |
| /// Patterns that are used to bubble up tensor.extract_slice ops above linalg
| /// ops.
| void populateBubbleUpExtractSliceOpPatterns(RewritePatternSet &patterns); |
| |
| /// Adds patterns that rewrite tensor.extract_slice(linalg.fill(%cst, %init))
| /// into linalg.fill(%cst, tensor.extract_slice(%init)).
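| /// For example (an illustrative sketch, types elided):
| /// ```
| ///   %f = linalg.fill ins(%cst) outs(%init)
| ///   %s = tensor.extract_slice %f[0, 0] [4, 4] [1, 1]
| /// ```
| /// becomes
| /// ```
| ///   %si = tensor.extract_slice %init[0, 0] [4, 4] [1, 1]
| ///   %s = linalg.fill ins(%cst) outs(%si)
| /// ```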
| void populateSwapExtractSliceWithFillPatterns(RewritePatternSet &patterns); |
| |
| /// Patterns to apply the `splitReduction` transformation.
| void populateSplitReductionPattern( |
| RewritePatternSet &patterns, |
| const ControlSplitReductionFn &controlSplitReductionFn, |
| bool useAlloc = false); |
| |
| /// Patterns to convert Linalg matmul ops to transposed variants. |
| void populateTransposeMatmulPatterns(RewritePatternSet &patterns, |
| bool transposeLHS = true); |
| |
| } // namespace linalg |
| } // namespace mlir |
| |
| #endif // MLIR_DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H |