| //===- HoistPadding.cpp - Hoisting transformation for PadTensorOp ---------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements functions concerned with hoisting padding operations. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "mlir/Dialect/Linalg/Transforms/HoistPadding.h" |
| #include "mlir/Analysis/AffineStructures.h" |
| #include "mlir/Analysis/SliceAnalysis.h" |
| #include "mlir/Dialect/Affine/IR/AffineValueMap.h" |
| #include "mlir/Dialect/Affine/Utils.h" |
| #include "mlir/Dialect/Linalg/IR/LinalgOps.h" |
| #include "mlir/Dialect/Linalg/Transforms/Transforms.h" |
| #include "mlir/Dialect/SCF/SCF.h" |
| #include "mlir/Dialect/SCF/Utils.h" |
| #include "mlir/Dialect/StandardOps/IR/Ops.h" |
| #include "mlir/Dialect/Tensor/IR/Tensor.h" |
| #include "mlir/Dialect/Vector/VectorOps.h" |
| #include "mlir/Dialect/Vector/VectorUtils.h" |
| #include "mlir/IR/AsmState.h" |
| #include "mlir/IR/BuiltinOps.h" |
| #include "mlir/IR/Dominance.h" |
| #include "mlir/Transforms/GreedyPatternRewriteDriver.h" |
| #include "mlir/Transforms/LoopUtils.h" |
| #include "llvm/ADT/StringRef.h" |
| #include "llvm/Support/Debug.h" |
| |
| using llvm::dbgs; |
| |
| #define DEBUG_TYPE "hoist-padding" |
| |
| #define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ") |
| |
| using namespace mlir; |
| using namespace mlir::linalg; |
| |
| /// Analysis class to support PadTensorOp hoisting across multiple enclosing |
| /// loops. The failure conditions are: |
| /// 1. Pad op has a use that is not an input of a LinalgOp. |
| /// 2. There is no immediately enclosing scf::ForOp. |
| /// 3. The backward slice from the pad op to the scf::ForOp to hoist above |
| ///    contains an unknown op with a region. |
| /// 4. The backward slice from the pad op to the scf::ForOp to hoist above is |
| ///    empty. |
| /// Other cases succeed and will trigger hoisting of the pad op. |
| struct HoistingAnalysis { |
| HoistingAnalysis(PadTensorOp padTensorOp, int nLevels); |
| |
| bool isValid() { return valid; } |
| |
| /// Footprint of the packedTensor, computed from the packingLoops and |
| /// `backwardSlice`. |
| FailureOr<SmallVector<Value>> getPackedTensorSizes(ImplicitLocOpBuilder &b); |
| |
| /// The padTensorOp that needs to be hoisted. |
| PadTensorOp padTensorOp; |
| |
| /// The maximum number of immediately enclosing scf::ForOp to hoist over. |
| int nLevels; |
| |
|   /// The outermost loop, determined by `nLevels`, above which `padTensorOp` |
|   /// will be hoisted. |
| scf::ForOp outermostEnclosingForOp; |
| |
| /// Backward slice rooted at `padTensorOp` and nested under |
| /// `outermostEnclosingForOp`. |
| SetVector<Operation *> backwardSlice; |
| |
|   /// The scf::ForOps immediately enclosing `padTensorOp` such that: |
|   ///   1. they are nested under `outermostEnclosingForOp` (inclusive), and |
|   ///   2. their induction variables are used, directly or indirectly, in the |
|   ///      computation of `padTensorOp`. |
|   /// The span of these loops determines the footprint of the packed tensor. |
| SetVector<scf::ForOp, SmallVector<scf::ForOp>, DenseSet<Operation *>> |
| packingLoops; |
| |
| private: |
| /// Encodes whether the analysis is valid and hoisting can proceed. |
| bool valid; |
| }; |
| |
| /// Return true if all uses of `padTensorOp` are input tensors of some |
| /// LinalgOp. |
| static bool isOnlyUsedAsInputOfLinalgOp(PadTensorOp padTensorOp) { |
| for (OpOperand &use : padTensorOp.result().getUses()) { |
| auto linalgUser = dyn_cast<linalg::LinalgOp>(use.getOwner()); |
| if (!linalgUser || !linalgUser.isInputTensor(&use)) { |
| LLVM_DEBUG(DBGS() << "Found a use of " << *(padTensorOp) |
| << "\nthat is not an input tensor of a LinalgOp, " |
| << "cannot hoist\n" |
| << *(use.getOwner()) << "\n"); |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /// Return at most nLevels of immediately enclosing scf::ForOp loops. |
| /// Stops at the first parent that is not an scf::ForOp. |
| /// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm. |
| /// Control-flow and other containing ops with regions are not modeled atm. |
| static void |
| getAtMostNEnclosingLoops(PadTensorOp padTensorOp, int nLevels, |
| SmallVector<scf::ForOp> &reverseEnclosingLoops) { |
| AsmState state(padTensorOp->getParentOfType<mlir::FuncOp>()); |
| (void)state; |
| scf::ForOp outermostEnclosingForOp = nullptr; |
| Operation *nextEnclosingOp = padTensorOp->getParentOp(); |
| while (nLevels-- > 0 && |
| (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) { |
| LLVM_DEBUG( |
| DBGS() << "loops: "; |
| outermostEnclosingForOp.getInductionVar().printAsOperand(dbgs(), state); |
| dbgs() << "\n"); |
| reverseEnclosingLoops.push_back(outermostEnclosingForOp); |
| nextEnclosingOp = outermostEnclosingForOp->getParentOp(); |
| } |
| } |
| |
| HoistingAnalysis::HoistingAnalysis(PadTensorOp padTensorOp, int nLevels) |
| : padTensorOp(padTensorOp), nLevels(nLevels), valid(false) { |
| AsmState state(padTensorOp->getParentOfType<mlir::FuncOp>()); |
| (void)state; |
| |
| // Bail on any use that isn't an input of a Linalg op. |
| // Hoisting of inplace updates happens after vectorization. |
| if (!isOnlyUsedAsInputOfLinalgOp(padTensorOp)) |
| return; |
| |
| // Get at most nLevels of immediately enclosing loops. |
| SmallVector<scf::ForOp> reverseEnclosingLoops; |
| getAtMostNEnclosingLoops(padTensorOp, nLevels, reverseEnclosingLoops); |
| if (reverseEnclosingLoops.empty()) { |
| LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n"); |
| return; |
| } |
| |
| outermostEnclosingForOp = reverseEnclosingLoops.back(); |
| |
|   // Get all the ops in the backward slice starting from `padTensorOp` that |
|   // are dominated by the outermost enclosing loop. |
|   // Bail on any op with a region that is neither an scf::ForOp nor a LinalgOp. |
| bool analysisFailure = false; |
| DominanceInfo domInfo(outermostEnclosingForOp); |
| getBackwardSlice( |
| padTensorOp.getOperation(), &backwardSlice, [&](Operation *op) { |
| if (!domInfo.dominates(outermostEnclosingForOp, op)) |
| return false; |
| if (op != padTensorOp && op->getNumRegions() > 0 && |
| !isa<scf::ForOp, LinalgOp>(op)) { |
| analysisFailure = true; |
| LLVM_DEBUG(DBGS() |
| << "Unsupported op with region: " << *op << " -> skip\n"); |
| return false; |
| } |
| return true; |
| }); |
| |
| if (analysisFailure || backwardSlice.empty()) |
| return; |
| |
| // Backward slice is a topologically sorted list of ops starting at |
| // `outermostEnclosingForOp`. |
| assert(outermostEnclosingForOp == backwardSlice.front()); |
| |
|   // Keep only the loops whose induction variable is used to compute the padded |
|   // result. As a first approximation, a loop is kept if its IV has at least one |
|   // user in the `backwardSlice`. |
|   // The filtered-out loops are the dimensions of reuse that we can exploit to |
|   // reduce the amount of copy / memory. |
| for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops)) { |
| for (Operation *user : forOp.getInductionVar().getUsers()) { |
| if (backwardSlice.contains(user)) { |
| packingLoops.insert(forOp); |
| break; |
| } |
| } |
| } |
| |
| // The analysis is valid and hoisting can occur. |
| valid = true; |
| } |
| |
| static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) { |
| return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<ConstantOp>(); |
| } |
| |
| /// For each loop in `loops`, determine the ops involved in the construction of |
| /// its upper bound, up to the `outerLimit` loop, and fold them as new |
| /// inequalities in the constraint set. |
| /// This is achieved by computing the backward slice of the loop's upper bound |
| /// and iteratively folding each op in reverse topological order to guarantee |
| /// use-def ordering. |
| /// As operations are folded in, their results are projected out of the |
| /// constraint set. |
| /// The following operations are supported: |
| ///   - scf::ForOp: simply skipped. |
| ///   - AffineApplyOp: composed to replace its result by an equality. |
| ///   - AffineMinOp: composed by adding each map result as an upper bound. |
| /// If any other operation is met, return failure. |
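| /// |
| /// For example (an illustrative sketch only, with made-up names), given: |
| /// |
| ///   scf.for %i = %c0 to %N step %c4 |
| ///     %m = affine.min affine_map<(d0)[s0] -> (4, s0 - d0)>(%i)[%N] |
| ///     scf.for %j = %c0 to %m step %c1 |
| /// |
| /// the inner upper bound %m is composed in as the inequalities `%m <= 4` and |
| /// `%m <= %N - %i`, after which %m is projected out of the constraint set. |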
| // TODO: extend on a per-need basis. |
| static LogicalResult |
| foldUpperBoundsIntoConstraintsSet(FlatAffineValueConstraints &constraints, |
| scf::ForOp outerLimit, |
| ArrayRef<scf::ForOp> loops) { |
| SetVector<Value> toProjectOut; |
| for (scf::ForOp loop : loops) { |
| auto ub = loop.upperBound(); |
| if (isDefinedOutsideOrConstant(outerLimit, ub)) |
| continue; |
| |
| // Compute a backward slice up to, but not including, `outerLimit`. |
| SetVector<Operation *> backwardSlice; |
| getBackwardSlice(ub, &backwardSlice, [&](Operation *op) { |
| return outerLimit->isProperAncestor(op); |
| }); |
| backwardSlice.insert(ub.getDefiningOp()); |
| |
| // Iterate over all ops in the slice and compose them in the constraints. |
| for (Operation *op : llvm::reverse(backwardSlice)) { |
| if (!isa<scf::ForOp, AffineApplyOp, AffineMinOp>(op)) |
| return failure(); |
| if (isa<scf::ForOp>(op)) |
| continue; |
|       // Ensure the value has an id in the constraint system, appending a new |
|       // dim id if needed. Fail if the value exists but is bound to a non-dim id. |
| auto ensureIdFailed = [&](Value v) { |
| if (constraints.containsId(v)) { |
| unsigned pos; |
| constraints.findId(v, &pos); |
| return pos >= constraints.getNumDimIds(); |
| } |
| constraints.appendDimId(v); |
| return false; |
| }; |
| |
| // Ensure all ids exist and add results for later projection. |
| if (llvm::any_of(op->getResults(), ensureIdFailed) || |
| llvm::any_of(op->getOperands(), ensureIdFailed)) |
| return failure(); |
| |
| // All supported ops have 1 result. |
| // TODO: extend when needed. |
| toProjectOut.insert(op->getResult(0)); |
| |
| // Compose supported ops. |
| if (auto affineApplyOp = dyn_cast<AffineApplyOp>(op)) { |
| AffineValueMap avm(affineApplyOp.getAffineMap(), |
| affineApplyOp.getOperands(), |
| affineApplyOp.getResult()); |
| if (failed(constraints.composeMap(&avm))) |
| return failure(); |
| continue; |
| } |
| auto affineMinOp = cast<AffineMinOp>(op); |
| unsigned pos; |
| bool foundMinOp = constraints.findId(affineMinOp.getResult(), &pos); |
| (void)foundMinOp; |
| assert(foundMinOp); |
| AffineMap alignedMap = constraints.computeAlignedMap( |
| affineMinOp.getAffineMap(), affineMinOp.getOperands()); |
| if (failed( |
| constraints.addBound(FlatAffineConstraints::UB, pos, alignedMap))) |
| return failure(); |
| } |
| } |
| for (Value v : toProjectOut) |
| constraints.projectOut(v); |
| return success(); |
| } |
| |
| // Footprint of the packedTensor, computed from the packingLoops and |
| // `backwardSlice`. |
| FailureOr<SmallVector<Value>> |
| HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) { |
|   // Create the base affine constraints for the packingLoops. |
| auto constraints = FlatAffineValueConstraints::getHyperrectangular( |
| llvm::to_vector<8>(llvm::map_range( |
| packingLoops, [](scf::ForOp op) { return op.getInductionVar(); })), |
| llvm::to_vector<8>(llvm::map_range( |
| packingLoops, [](scf::ForOp op) { return op.lowerBound(); })), |
| llvm::to_vector<8>(llvm::map_range( |
| packingLoops, [](scf::ForOp op) { return op.upperBound(); }))); |
| |
| // Iteratively try to fold the upper bounds into the constraints set. |
| if (failed(foldUpperBoundsIntoConstraintsSet( |
| constraints, outermostEnclosingForOp, packingLoops.getArrayRef()))) |
| return failure(); |
| |
| int nPackedLoops = packingLoops.size(); |
| SmallVector<AffineMap> lbs(nPackedLoops), ubs(nPackedLoops); |
|   // Compute the bounds of the first `nPackedLoops` positions, assuming the |
|   // other positions are fixed. |
| constraints.getSliceBounds(/*pos=*/0, /*num=*/nPackedLoops, |
| outermostEnclosingForOp->getContext(), &lbs, &ubs); |
| |
| SmallVector<Value> allValues; |
| constraints.getAllValues(&allValues); |
| SmallVector<Value> allNonLoopValues(allValues.begin() + nPackedLoops, |
| allValues.end()); |
| |
|   // For each packing loop, compute the extent as (ub - lb).ceilDiv(step). |
|   // The insertion point is just before the outermost loop that we hoist above. |
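|   // For example (illustrative numbers only): a packing loop with lb = 0, |
|   // ub = 13 and step = 4 contributes an extent of ceildiv(13 - 0, 4) = 4 |
|   // slices to the packed tensor. |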
| assert(nPackedLoops == static_cast<int64_t>(lbs.size()) && |
| "expected matching lb sizes"); |
| assert(nPackedLoops == static_cast<int64_t>(ubs.size()) && |
| "expected matching ub sizes"); |
| SmallVector<Value> dynamicTensorSizes; |
| for (auto it : llvm::zip(packingLoops, lbs, ubs)) { |
| scf::ForOp loop = std::get<0>(it); |
| AffineMap lbMap = std::get<1>(it); |
| AffineMap ubMap = std::get<2>(it); |
| SmallVector<Value> lbOperands(allNonLoopValues); |
| canonicalizeMapAndOperands(&lbMap, &lbOperands); |
| Value lbVal = b.createOrFold<AffineMaxOp>(lbMap, lbOperands); |
| |
| SmallVector<Value> ubOperands(allNonLoopValues); |
| canonicalizeMapAndOperands(&ubMap, &ubOperands); |
| Value ubVal = b.createOrFold<AffineMinOp>(ubMap, ubOperands); |
| |
| AffineExpr lb, ub, step; |
| bindDims(b.getContext(), lb, ub); |
| bindSymbols(b.getContext(), step); |
| Value res = b.createOrFold<AffineApplyOp>( |
| (ub - lb).ceilDiv(step), |
| ValueRange{lbVal, ubVal, cast<scf::ForOp>(loop).step()}); |
| |
| dynamicTensorSizes.push_back(res); |
| } |
| return dynamicTensorSizes; |
| } |
| |
| /// Return true if `v` is a value that is only transitively defined by ops whose |
| /// type is in `OpTypeList`. |
| template <typename... OpTypeList> |
| static bool backwardsSliceOnlyHasOpsOfType(scf::ForOp outerLimit, Value v) { |
| // Compute a backward slice up to, but not including, `outerLimit`. |
| SetVector<Operation *> backwardSlice; |
| getBackwardSlice(v, &backwardSlice, [&](Operation *op) { |
| return outerLimit->isProperAncestor(op); |
| }); |
| // Traverse the backward slice and ensure we can perform the computation to |
| // hoist. |
| for (Operation *op : backwardSlice) { |
| if (isa<OpTypeList...>(op)) |
| continue; |
| LLVM_DEBUG(DBGS() << "Abort: unadmissible op in slice " << *op << "\n"); |
| return false; |
| } |
| return true; |
| } |
| |
| /// Return the current iteration number in the loop (iv - lb).ceilDiv(step). |
| /// The returned Value is guaranteed not to depend on any loop contained in |
| /// [`outer`, `forOp`]. |
| /// Return null if such a loop-independent quantity cannot be computed. |
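| /// For example (illustrative numbers only): with lb = 4 and step = 2, induction |
| /// variable values 4, 6 and 8 map to iteration numbers 0, 1 and 2 respectively. |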
| static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer, |
| scf::ForOp forOp) { |
| MLIRContext *ctx = forOp->getContext(); |
| AffineExpr iv, lb, step; |
| bindDims(ctx, iv, lb); |
| bindSymbols(ctx, step); |
| if (!isDefinedOutsideOrConstant(outer, forOp.lowerBound()) || |
| !isDefinedOutsideOrConstant(outer, forOp.step())) |
| return Value(); |
| Value ivVal = forOp.getInductionVar(), lbVal = forOp.lowerBound(), |
| stepVal = forOp.step(); |
| auto loc = forOp->getLoc(); |
| return b.createOrFold<AffineApplyOp>(loc, (iv - lb).ceilDiv(step), |
| ValueRange{ivVal, lbVal, stepVal}); |
| } |
| |
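| /// A rough sketch of the rewrite in pseudo-IR (illustrative only; names, shapes |
| /// and indexing are simplified). Hoisting over a single loop rewrites: |
| /// |
| ///   scf.for %i = %lb to %ub step %step { |
| ///     %slice = tensor.extract_slice %t[%i] ... |
| ///     %pad = linalg.pad_tensor %slice ... : tensor<?xf32> to tensor<4xf32> |
| ///     ... %pad used as an input of a linalg op ... |
| ///   } |
| /// |
| /// into a prologue that packs all the padded tiles once, followed by the |
| /// original loop reading 1x..x1 slices of the packed tensor: |
| /// |
| ///   %packed = scf.for %i = %lb to %ub step %step iter_args(%p = %init) { |
| ///     %iter = (%i - %lb) ceildiv %step |
| ///     %slice = tensor.extract_slice %t[%i] ... |
| ///     %pad = linalg.pad_tensor %slice ... |
| ///     %ins = tensor.insert_slice %pad into %p[%iter, 0] ... |
| ///     scf.yield %ins : tensor<?x4xf32> |
| ///   } |
| ///   scf.for %i = %lb to %ub step %step { |
| ///     %iter = (%i - %lb) ceildiv %step |
| ///     %pad = tensor.extract_slice %packed[%iter, 0] ... |
| ///     ... %pad used as an input of a linalg op ... |
| ///   } |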
| LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp, |
| int nLoops) { |
| LLVM_DEBUG(DBGS() << "Try to hoist " << *(padTensorOp) << " by " << nLoops |
| << " loops\n"); |
| HoistingAnalysis analysis(padTensorOp, nLoops); |
| if (!analysis.isValid()) { |
| LLVM_DEBUG(DBGS() << "Analysis failed -> Skip\n"); |
| return failure(); |
| } |
| |
| scf::ForOp outer = analysis.outermostEnclosingForOp; |
| ImplicitLocOpBuilder b(outer->getLoc(), outer); |
| |
| auto maybeDynamicTensorSizes = analysis.getPackedTensorSizes(b); |
| if (failed(maybeDynamicTensorSizes)) |
| return failure(); |
| SmallVector<Value> dynamicTensorSizes = *maybeDynamicTensorSizes; |
| |
|   // The actual number of loops to hoist over, which may be smaller than nLoops. |
| int nPackedLoops = analysis.packingLoops.size(); |
| |
| Location loc = padTensorOp->getLoc(); |
| RankedTensorType paddedTensorType = padTensorOp.getResultType(); |
| int paddedRank = paddedTensorType.getRank(); |
| |
| // Create the packed tensor<?x?x..?xpadded_shape> into which we amortize |
| // padding. |
| SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamicSize); |
| // TODO: go grab dims when necessary, for now PadTensorOp returns a static |
| // tensor. |
| llvm::append_range(packedShape, paddedTensorType.getShape()); |
| auto packedTensorType = |
| RankedTensorType::get(packedShape, paddedTensorType.getElementType()); |
| Value packedTensor = b.create<linalg::InitTensorOp>( |
| loc, dynamicTensorSizes, packedTensorType.getShape(), |
| packedTensorType.getElementType()); |
| |
| // Clone the operations involved in the backward slice, iteratively stepping |
| // into the loops that we encounter. |
| // The implementation proceeds in a stack-like fashion: |
| // 1. Iteratively clone and step into the loops, pushing the `packedTensor` |
| // deeper in the stack. |
|   // 2. Create an InsertSliceOp at the top of the stack. |
| // 3. Iteratively pop and yield the result of the InsertSliceOp across |
| // the cloned loops. |
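|   // A rough sketch of the resulting prologue for 2 packing loops (illustrative |
|   // pseudo-IR only): |
|   //   %packed = scf.for %i0 ... iter_args(%p0 = %init) { |
|   //     %inner = scf.for %i1 ... iter_args(%p1 = %p0) { |
|   //       ... cloned backward slice, including the cloned pad ... |
|   //       %ins = tensor.insert_slice %pad into %p1[%it0, %it1, 0, ...] ... |
|   //       scf.yield %ins |
|   //     } |
|   //     scf.yield %inner |
|   //   } |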
| SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings; |
| clonedLoopIvs.reserve(nPackedLoops); |
| leadingPackedTensorIndexings.reserve(nPackedLoops); |
| BlockAndValueMapping bvm; |
| // Insert `padTensorOp` into the backwardSlice so we clone it too. |
| analysis.backwardSlice.insert(padTensorOp); |
| // Stack step 1. iteratively clone loops and push `packedTensor`. |
| for (Operation *op : analysis.backwardSlice) { |
|     // Specifically skip the extract_slice(packedTensor) case: this is the piece |
|     // we seek to replace. |
| if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op)) |
| if (bvm.lookupOrDefault(sliceOp.source()) == packedTensor) |
| continue; |
| auto effects = dyn_cast<MemoryEffectOpInterface>(op); |
| bool hasNoEffects = !effects || effects.hasNoEffect(); |
| if (hasNoEffects && |
| (op->getNumRegions() == 0 || isa<linalg::PadTensorOp>(op))) { |
| b.clone(*op, bvm); |
| continue; |
| } |
| // TODO: support more cases as they appear. |
| auto forOp = dyn_cast<scf::ForOp>(op); |
| assert(forOp && "Expected scf::ForOp when hoisting pad ops"); |
| // Unused loop, just skip it. |
| if (!analysis.packingLoops.contains(forOp)) |
| continue; |
| |
| auto clonedForOp = |
| b.create<scf::ForOp>(loc, bvm.lookupOrDefault(forOp.lowerBound()), |
| bvm.lookupOrDefault(forOp.upperBound()), |
| bvm.lookupOrDefault(forOp.step()), packedTensor); |
| // Map the induction var, region args and results to the `clonedForOp`. |
| bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar()); |
| bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs()); |
| bvm.map(forOp.getResults(), clonedForOp.getResults()); |
| assert(clonedForOp->getNumRegions() == 1); |
| clonedLoopIvs.push_back(clonedForOp.getInductionVar()); |
| |
| b.setInsertionPointToStart(&clonedForOp->getRegion(0).front()); |
| Value loopIndependentIterationCount = |
| buildLoopIterationCount(b, outer, clonedForOp); |
| // Assert the loop-independent iteration count can be computed. |
| if (!loopIndependentIterationCount) |
| llvm_unreachable("loop independence prerequisite not met"); |
| leadingPackedTensorIndexings.push_back(loopIndependentIterationCount); |
| packedTensor = clonedForOp.getRegionIterArgs().front(); |
| } |
| |
| // Stack step 2. create InsertSliceOp at the top of the stack. |
| // offsets = [clonedLoopIvs, 0 .. 0]. |
| SmallVector<OpFoldResult> offsets(leadingPackedTensorIndexings.begin(), |
| leadingPackedTensorIndexings.end()); |
| offsets.append(paddedRank, b.getIndexAttr(0)); |
| // sizes = [1 .. 1, paddedShape]. |
| SmallVector<OpFoldResult> sizes(nPackedLoops, b.getIndexAttr(1)); |
| for (int64_t sz : paddedTensorType.getShape()) { |
| // TODO: go grab dims when necessary, for now PadTensorOp returns a static |
| // tensor. |
| assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes"); |
| sizes.push_back(b.getIndexAttr(sz)); |
| } |
| // strides = [1 .. 1]. |
| SmallVector<OpFoldResult> strides(nPackedLoops + paddedRank, |
| b.getIndexAttr(1)); |
| |
| Value inserted = |
| b.create<tensor::InsertSliceOp>(loc, bvm.lookup(padTensorOp.result()), |
| packedTensor, offsets, sizes, strides); |
| |
| // Stack step 3. iteratively pop the stack and propagate the yield. |
| Value valueToYield = inserted; |
| for (Value iv : llvm::reverse(clonedLoopIvs)) { |
| auto forOp = scf::getForInductionVarOwner(iv); |
| b.setInsertionPointToEnd(&forOp.getRegion().front()); |
| b.create<scf::YieldOp>(loc, valueToYield); |
| valueToYield = forOp.getResult(0); |
| } |
| |
| // Now the packed tensor is ready, replace the original padding op by a |
| // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1]. |
| b.setInsertionPoint(padTensorOp); |
| SmallVector<Value> loopIterationCounts = llvm::to_vector<4>( |
| llvm::map_range(analysis.packingLoops, [&](Operation *loop) { |
| return buildLoopIterationCount(b, outer, cast<scf::ForOp>(loop)); |
| })); |
| // Assert all loop iteration counts can be computed. |
| if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; })) |
| llvm_unreachable("loop independence prerequisite not met"); |
| // offsets = [originalLoopIvs, 0 .. 0]. |
| offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end()); |
| offsets.append(paddedRank, b.getIndexAttr(0)); |
|   // sizes = [1 .. 1, paddedShape] (defined above). |
|   // strides = [1 .. 1] (defined above). |
| packedTensor = |
| scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0); |
| padTensorOp.replaceAllUsesWith( |
| b.create<tensor::ExtractSliceOp>(loc, padTensorOp.getResultType(), |
| packedTensor, offsets, sizes, strides) |
| ->getResult(0)); |
| |
| Operation *toErase = padTensorOp; |
| |
| // Make the newly cloned `padTensorOp` available to the caller. |
| padTensorOp = |
| cast<PadTensorOp>(bvm.lookup(padTensorOp.result()).getDefiningOp()); |
| |
| toErase->erase(); |
| |
| return success(); |
| } |