mlir/lib/Analysis/Utils.cpp - llvm-project - Git at Google

 //===- Utils.cpp ---- Misc utilities for analysis -------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file implements miscellaneous analysis routines for non-loop IR
 // structures.
 //
 //===----------------------------------------------------------------------===//

 #include "mlir/Analysis/Utils.h"
 #include "mlir/Analysis/AffineAnalysis.h"
 #include "mlir/Analysis/LoopAnalysis.h"
 #include "mlir/Analysis/PresburgerSet.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/IntegerSet.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"

 #define DEBUG_TYPE "analysis-utils"

 using namespace mlir;

 using llvm::SmallDenseMap;

 /// Populates 'loops' with IVs of the loops surrounding 'op' ordered from
 /// the outermost 'affine.for' operation to the innermost one.
 void mlir::getLoopIVs(Operation &op, SmallVectorImpl<AffineForOp> *loops) {
   auto *currOp = op.getParentOp();
   AffineForOp currAffineForOp;
   // Traverse up the hierarchy collecting all 'affine.for' operation while
   // skipping over 'affine.if' operations.
   while (currOp) {
     if (AffineForOp currAffineForOp = dyn_cast<AffineForOp>(currOp))
       loops->push_back(currAffineForOp);
     currOp = currOp->getParentOp();
   }
   std::reverse(loops->begin(), loops->end());
 }

 /// Populates 'ops' with IVs of the loops surrounding `op`, along with
 /// `affine.if` operations interleaved between these loops, ordered from the
 /// outermost `affine.for` operation to the innermost one.
 void mlir::getEnclosingAffineForAndIfOps(Operation &op,
                                          SmallVectorImpl<Operation *> *ops) {
   ops->clear();
   Operation *currOp = op.getParentOp();

   // Traverse up the hierarchy collecting all `affine.for` and `affine.if`
   // operations.
   while (currOp) {
     if (isa<AffineIfOp, AffineForOp>(currOp))
       ops->push_back(currOp);
     currOp = currOp->getParentOp();
   }
   std::reverse(ops->begin(), ops->end());
 }

 // Populates 'cst' with FlatAffineValueConstraints which represent original
 // domain of the loop bounds that define 'ivs'.
 LogicalResult
 ComputationSliceState::getSourceAsConstraints(FlatAffineValueConstraints &cst) {
   assert(!ivs.empty() && "Cannot have a slice without its IVs");
   cst.reset(/*numDims=*/ivs.size(), /*numSymbols=*/0, /*numLocals=*/0, ivs);
   for (Value iv : ivs) {
     AffineForOp loop = getForInductionVarOwner(iv);
     assert(loop && "Expected affine for");
     if (failed(cst.addAffineForOpDomain(loop)))
       return failure();
   }
   return success();
 }

 // Populates 'cst' with FlatAffineValueConstraints which represent slice bounds.
 LogicalResult
 ComputationSliceState::getAsConstraints(FlatAffineValueConstraints *cst) {
   assert(!lbOperands.empty());
   // Adds src 'ivs' as dimension identifiers in 'cst'.
   unsigned numDims = ivs.size();
   // Adds operands (dst ivs and symbols) as symbols in 'cst'.
   unsigned numSymbols = lbOperands[0].size();

   SmallVector<Value, 4> values(ivs);
   // Append 'ivs' then 'operands' to 'values'.
   values.append(lbOperands[0].begin(), lbOperands[0].end());
   cst->reset(numDims, numSymbols, 0, values);

   // Add loop bound constraints for values which are loop IVs of the destination
   // of fusion and equality constraints for symbols which are constants.
   for (unsigned i = numDims, end = values.size(); i < end; ++i) {
     Value value = values[i];
     assert(cst->containsId(value) && "value expected to be present");
     if (isValidSymbol(value)) {
       // Check if the symbol is a constant.
       if (auto cOp = value.getDefiningOp<arith::ConstantIndexOp>())
         cst->addBound(FlatAffineConstraints::EQ, value, cOp.value());
     } else if (auto loop = getForInductionVarOwner(value)) {
       if (failed(cst->addAffineForOpDomain(loop)))
         return failure();
     }
   }

   // Add slices bounds on 'ivs' using maps 'lbs'/'ubs' with 'lbOperands[0]'
   LogicalResult ret = cst->addSliceBounds(ivs, lbs, ubs, lbOperands[0]);
   assert(succeeded(ret) &&
          "should not fail as we never have semi-affine slice maps");
   (void)ret;
   return success();
 }

 // Clears state bounds and operand state.
 void ComputationSliceState::clearBounds() {
   lbs.clear();
   ubs.clear();
   lbOperands.clear();
   ubOperands.clear();
 }

 void ComputationSliceState::dump() const {
   llvm::errs() << "\tIVs:\n";
   for (Value iv : ivs)
     llvm::errs() << "\t\t" << iv << "\n";

   llvm::errs() << "\tLBs:\n";
   for (auto &en : llvm::enumerate(lbs)) {
     llvm::errs() << "\t\t" << en.value() << "\n";
     llvm::errs() << "\t\tOperands:\n";
     for (Value lbOp : lbOperands[en.index()])
       llvm::errs() << "\t\t\t" << lbOp << "\n";
   }

   llvm::errs() << "\tUBs:\n";
   for (auto &en : llvm::enumerate(ubs)) {
     llvm::errs() << "\t\t" << en.value() << "\n";
     llvm::errs() << "\t\tOperands:\n";
     for (Value ubOp : ubOperands[en.index()])
       llvm::errs() << "\t\t\t" << ubOp << "\n";
   }
 }

 /// Fast check to determine if the computation slice is maximal. Returns true if
 /// each slice dimension maps to an existing dst dimension and both the src
 /// and the dst loops for those dimensions have the same bounds. Returns false
 /// if both the src and the dst loops don't have the same bounds. Returns
 /// llvm::None if none of the above can be proven.
 Optional<bool> ComputationSliceState::isSliceMaximalFastCheck() const {
   assert(lbs.size() == ubs.size() && lbs.size() && ivs.size() &&
          "Unexpected number of lbs, ubs and ivs in slice");

   for (unsigned i = 0, end = lbs.size(); i < end; ++i) {
     AffineMap lbMap = lbs[i];
     AffineMap ubMap = ubs[i];

     // Check if this slice is just an equality along this dimension.
     if (!lbMap || !ubMap || lbMap.getNumResults() != 1 ||
         ubMap.getNumResults() != 1 ||
         lbMap.getResult(0) + 1 != ubMap.getResult(0) ||
         // The condition above will be true for maps describing a single
         // iteration (e.g., lbMap.getResult(0) = 0, ubMap.getResult(0) = 1).
         // Make sure we skip those cases by checking that the lb result is not
         // just a constant.
         lbMap.getResult(0).isa<AffineConstantExpr>())
       return llvm::None;

     // Limited support: we expect the lb result to be just a loop dimension for
     // now.
     AffineDimExpr result = lbMap.getResult(0).dyn_cast<AffineDimExpr>();
     if (!result)
       return llvm::None;

     // Retrieve dst loop bounds.
     AffineForOp dstLoop =
         getForInductionVarOwner(lbOperands[i][result.getPosition()]);
     if (!dstLoop)
       return llvm::None;
     AffineMap dstLbMap = dstLoop.getLowerBoundMap();
     AffineMap dstUbMap = dstLoop.getUpperBoundMap();

     // Retrieve src loop bounds.
     AffineForOp srcLoop = getForInductionVarOwner(ivs[i]);
     assert(srcLoop && "Expected affine for");
     AffineMap srcLbMap = srcLoop.getLowerBoundMap();
     AffineMap srcUbMap = srcLoop.getUpperBoundMap();

     // Limited support: we expect simple src and dst loops with a single
     // constant component per bound for now.
     if (srcLbMap.getNumResults() != 1 || srcUbMap.getNumResults() != 1 ||
         dstLbMap.getNumResults() != 1 || dstUbMap.getNumResults() != 1)
       return llvm::None;

     AffineExpr srcLbResult = srcLbMap.getResult(0);
     AffineExpr dstLbResult = dstLbMap.getResult(0);
     AffineExpr srcUbResult = srcUbMap.getResult(0);
     AffineExpr dstUbResult = dstUbMap.getResult(0);
     if (!srcLbResult.isa<AffineConstantExpr>() ||
         !srcUbResult.isa<AffineConstantExpr>() ||
         !dstLbResult.isa<AffineConstantExpr>() ||
         !dstUbResult.isa<AffineConstantExpr>())
       return llvm::None;

     // Check if src and dst loop bounds are the same. If not, we can guarantee
     // that the slice is not maximal.
     if (srcLbResult != dstLbResult || srcUbResult != dstUbResult ||
         srcLoop.getStep() != dstLoop.getStep())
       return false;
   }

   return true;
 }

 /// Returns true if it is deterministically verified that the original iteration
 /// space of the slice is contained within the new iteration space that is
 /// created after fusing 'this' slice into its destination.
 Optional<bool> ComputationSliceState::isSliceValid() {
   // Fast check to determine if the slice is valid. If the following conditions
   // are verified to be true, slice is declared valid by the fast check:
   // 1. Each slice loop is a single iteration loop bound in terms of a single
   //    destination loop IV.
   // 2. Loop bounds of the destination loop IV (from above) and those of the
   //    source loop IV are exactly the same.
   // If the fast check is inconclusive or false, we proceed with a more
   // expensive analysis.
   // TODO: Store the result of the fast check, as it might be used again in
   // `canRemoveSrcNodeAfterFusion`.
   Optional<bool> isValidFastCheck = isSliceMaximalFastCheck();
   if (isValidFastCheck.hasValue() && isValidFastCheck.getValue())
     return true;

   // Create constraints for the source loop nest using which slice is computed.
   FlatAffineValueConstraints srcConstraints;
   // TODO: Store the source's domain to avoid computation at each depth.
   if (failed(getSourceAsConstraints(srcConstraints))) {
     LLVM_DEBUG(llvm::dbgs() << "Unable to compute source's domain\n");
     return llvm::None;
   }
   // As the set difference utility currently cannot handle symbols in its
   // operands, validity of the slice cannot be determined.
   if (srcConstraints.getNumSymbolIds() > 0) {
     LLVM_DEBUG(llvm::dbgs() << "Cannot handle symbols in source domain\n");
     return llvm::None;
   }
   // TODO: Handle local ids in the source domains while using the 'projectOut'
   // utility below. Currently, aligning is not done assuming that there will be
   // no local ids in the source domain.
   if (srcConstraints.getNumLocalIds() != 0) {
     LLVM_DEBUG(llvm::dbgs() << "Cannot handle locals in source domain\n");
     return llvm::None;
   }

   // Create constraints for the slice loop nest that would be created if the
   // fusion succeeds.
   FlatAffineValueConstraints sliceConstraints;
   if (failed(getAsConstraints(&sliceConstraints))) {
     LLVM_DEBUG(llvm::dbgs() << "Unable to compute slice's domain\n");
     return llvm::None;
   }

   // Projecting out every dimension other than the 'ivs' to express slice's
   // domain completely in terms of source's IVs.
   sliceConstraints.projectOut(ivs.size(),
                               sliceConstraints.getNumIds() - ivs.size());

   LLVM_DEBUG(llvm::dbgs() << "Domain of the source of the slice:\n");
   LLVM_DEBUG(srcConstraints.dump());
   LLVM_DEBUG(llvm::dbgs() << "Domain of the slice if this fusion succeeds "
                              "(expressed in terms of its source's IVs):\n");
   LLVM_DEBUG(sliceConstraints.dump());

   // TODO: Store 'srcSet' to avoid recalculating for each depth.
   PresburgerSet srcSet(srcConstraints);
   PresburgerSet sliceSet(sliceConstraints);
   PresburgerSet diffSet = sliceSet.subtract(srcSet);

   if (!diffSet.isIntegerEmpty()) {
     LLVM_DEBUG(llvm::dbgs() << "Incorrect slice\n");
     return false;
   }
   return true;
 }

 /// Returns true if the computation slice encloses all the iterations of the
 /// sliced loop nest. Returns false if it does not. Returns llvm::None if it
 /// cannot determine if the slice is maximal or not.
 Optional<bool> ComputationSliceState::isMaximal() const {
   // Fast check to determine if the computation slice is maximal. If the result
   // is inconclusive, we proceed with a more expensive analysis.
   Optional<bool> isMaximalFastCheck = isSliceMaximalFastCheck();
   if (isMaximalFastCheck.hasValue())
     return isMaximalFastCheck;

   // Create constraints for the src loop nest being sliced.
   FlatAffineValueConstraints srcConstraints;
   srcConstraints.reset(/*numDims=*/ivs.size(), /*numSymbols=*/0,
                        /*numLocals=*/0, ivs);
   for (Value iv : ivs) {
     AffineForOp loop = getForInductionVarOwner(iv);
     assert(loop && "Expected affine for");
     if (failed(srcConstraints.addAffineForOpDomain(loop)))
       return llvm::None;
   }

   // Create constraints for the slice using the dst loop nest information. We
   // retrieve existing dst loops from the lbOperands.
   SmallVector<Value, 8> consumerIVs;
   for (Value lbOp : lbOperands[0])
     if (getForInductionVarOwner(lbOp))
       consumerIVs.push_back(lbOp);

   // Add empty IV Values for those new loops that are not equalities and,
   // therefore, are not yet materialized in the IR.
   for (int i = consumerIVs.size(), end = ivs.size(); i < end; ++i)
     consumerIVs.push_back(Value());

   FlatAffineValueConstraints sliceConstraints;
   sliceConstraints.reset(/*numDims=*/consumerIVs.size(), /*numSymbols=*/0,
                          /*numLocals=*/0, consumerIVs);

   if (failed(sliceConstraints.addDomainFromSliceMaps(lbs, ubs, lbOperands[0])))
     return llvm::None;

   if (srcConstraints.getNumDimIds() != sliceConstraints.getNumDimIds())
     // Constraint dims are different. The integer set difference can't be
     // computed so we don't know if the slice is maximal.
     return llvm::None;

   // Compute the difference between the src loop nest and the slice integer
   // sets.
   PresburgerSet srcSet(srcConstraints);
   PresburgerSet sliceSet(sliceConstraints);
   PresburgerSet diffSet = srcSet.subtract(sliceSet);
   return diffSet.isIntegerEmpty();
 }

 unsigned MemRefRegion::getRank() const {
   return memref.getType().cast<MemRefType>().getRank();
 }

 Optional<int64_t> MemRefRegion::getConstantBoundingSizeAndShape(
     SmallVectorImpl<int64_t> *shape, std::vector<SmallVector<int64_t, 4>> *lbs,
     SmallVectorImpl<int64_t> *lbDivisors) const {
   auto memRefType = memref.getType().cast<MemRefType>();
   unsigned rank = memRefType.getRank();
   if (shape)
     shape->reserve(rank);

   assert(rank == cst.getNumDimIds() && "inconsistent memref region");

   // Use a copy of the region constraints that has upper/lower bounds for each
   // memref dimension with static size added to guard against potential
   // over-approximation from projection or union bounding box. We may not add
   // this on the region itself since they might just be redundant constraints
   // that will need non-trivials means to eliminate.
   FlatAffineConstraints cstWithShapeBounds(cst);
   for (unsigned r = 0; r < rank; r++) {
     cstWithShapeBounds.addBound(FlatAffineConstraints::LB, r, 0);
     int64_t dimSize = memRefType.getDimSize(r);
     if (ShapedType::isDynamic(dimSize))
       continue;
     cstWithShapeBounds.addBound(FlatAffineConstraints::UB, r, dimSize - 1);
   }

   // Find a constant upper bound on the extent of this memref region along each
   // dimension.
   int64_t numElements = 1;
   int64_t diffConstant;
   int64_t lbDivisor;
   for (unsigned d = 0; d < rank; d++) {
     SmallVector<int64_t, 4> lb;
     Optional<int64_t> diff =
         cstWithShapeBounds.getConstantBoundOnDimSize(d, &lb, &lbDivisor);
     if (diff.hasValue()) {
       diffConstant = diff.getValue();
       assert(diffConstant >= 0 && "Dim size bound can't be negative");
       assert(lbDivisor > 0);
     } else {
       // If no constant bound is found, then it can always be bound by the
       // memref's dim size if the latter has a constant size along this dim.
       auto dimSize = memRefType.getDimSize(d);
       if (dimSize == -1)
         return None;
       diffConstant = dimSize;
       // Lower bound becomes 0.
       lb.resize(cstWithShapeBounds.getNumSymbolIds() + 1, 0);
       lbDivisor = 1;
     }
     numElements *= diffConstant;
     if (lbs) {
       lbs->push_back(lb);
       assert(lbDivisors && "both lbs and lbDivisor or none");
       lbDivisors->push_back(lbDivisor);
     }
     if (shape) {
       shape->push_back(diffConstant);
     }
   }
   return numElements;
 }

 void MemRefRegion::getLowerAndUpperBound(unsigned pos, AffineMap &lbMap,
                                          AffineMap &ubMap) const {
   assert(pos < cst.getNumDimIds() && "invalid position");
   auto memRefType = memref.getType().cast<MemRefType>();
   unsigned rank = memRefType.getRank();

   assert(rank == cst.getNumDimIds() && "inconsistent memref region");

   auto boundPairs = cst.getLowerAndUpperBound(
       pos, /*offset=*/0, /*num=*/rank, cst.getNumDimAndSymbolIds(),
       /*localExprs=*/{}, memRefType.getContext());
   lbMap = boundPairs.first;
   ubMap = boundPairs.second;
   assert(lbMap && "lower bound for a region must exist");
   assert(ubMap && "upper bound for a region must exist");
   assert(lbMap.getNumInputs() == cst.getNumDimAndSymbolIds() - rank);
   assert(ubMap.getNumInputs() == cst.getNumDimAndSymbolIds() - rank);
 }

 LogicalResult MemRefRegion::unionBoundingBox(const MemRefRegion &other) {
   assert(memref == other.memref);
   return cst.unionBoundingBox(*other.getConstraints());
 }

 /// Computes the memory region accessed by this memref with the region
 /// represented as constraints symbolic/parametric in 'loopDepth' loops
 /// surrounding opInst and any additional Function symbols.
 //  For example, the memref region for this load operation at loopDepth = 1 will
 //  be as below:
 //
 //    affine.for %i = 0 to 32 {
 //      affine.for %ii = %i to (d0) -> (d0 + 8) (%i) {
 //        load %A[%ii]
 //      }
 //    }
 //
 // region:  {memref = %A, write = false, {%i <= m0 <= %i + 7} }
 // The last field is a 2-d FlatAffineConstraints symbolic in %i.
 //
 // TODO: extend this to any other memref dereferencing ops
 // (dma_start, dma_wait).
 LogicalResult MemRefRegion::compute(Operation *op, unsigned loopDepth,
                                     const ComputationSliceState *sliceState,
                                     bool addMemRefDimBounds) {
   assert((isa<AffineReadOpInterface, AffineWriteOpInterface>(op)) &&
          "affine read/write op expected");

   MemRefAccess access(op);
   memref = access.memref;
   write = access.isStore();

   unsigned rank = access.getRank();

   LLVM_DEBUG(llvm::dbgs() << "MemRefRegion::compute: " << *op
                           << "depth: " << loopDepth << "\n";);

   // 0-d memrefs.
   if (rank == 0) {
     SmallVector<AffineForOp, 4> ivs;
     getLoopIVs(*op, &ivs);
     assert(loopDepth <= ivs.size() && "invalid 'loopDepth'");
     // The first 'loopDepth' IVs are symbols for this region.
     ivs.resize(loopDepth);
     SmallVector<Value, 4> regionSymbols;
     extractForInductionVars(ivs, &regionSymbols);
     // A 0-d memref has a 0-d region.
     cst.reset(rank, loopDepth, /*numLocals=*/0, regionSymbols);
     return success();
   }

   // Build the constraints for this region.
   AffineValueMap accessValueMap;
   access.getAccessMap(&accessValueMap);
   AffineMap accessMap = accessValueMap.getAffineMap();

   unsigned numDims = accessMap.getNumDims();
   unsigned numSymbols = accessMap.getNumSymbols();
   unsigned numOperands = accessValueMap.getNumOperands();
   // Merge operands with slice operands.
   SmallVector<Value, 4> operands;
   operands.resize(numOperands);
   for (unsigned i = 0; i < numOperands; ++i)
     operands[i] = accessValueMap.getOperand(i);

   if (sliceState != nullptr) {
     operands.reserve(operands.size() + sliceState->lbOperands[0].size());
     // Append slice operands to 'operands' as symbols.
     for (auto extraOperand : sliceState->lbOperands[0]) {
       if (!llvm::is_contained(operands, extraOperand)) {
         operands.push_back(extraOperand);
         numSymbols++;
       }
     }
   }
   // We'll first associate the dims and symbols of the access map to the dims
   // and symbols resp. of cst. This will change below once cst is
   // fully constructed out.
   cst.reset(numDims, numSymbols, 0, operands);

   // Add equality constraints.
   // Add inequalities for loop lower/upper bounds.
   for (unsigned i = 0; i < numDims + numSymbols; ++i) {
     auto operand = operands[i];
     if (auto loop = getForInductionVarOwner(operand)) {
       // Note that cst can now have more dimensions than accessMap if the
       // bounds expressions involve outer loops or other symbols.
       // TODO: rewrite this to use getInstIndexSet; this way
       // conditionals will be handled when the latter supports it.
       if (failed(cst.addAffineForOpDomain(loop)))
         return failure();
     } else {
       // Has to be a valid symbol.
       auto symbol = operand;
       assert(isValidSymbol(symbol));
       // Check if the symbol is a constant.
       if (auto *op = symbol.getDefiningOp()) {
         if (auto constOp = dyn_cast<arith::ConstantIndexOp>(op)) {
           cst.addBound(FlatAffineConstraints::EQ, symbol, constOp.value());
         }
       }
     }
   }

   // Add lower/upper bounds on loop IVs using bounds from 'sliceState'.
   if (sliceState != nullptr) {
     // Add dim and symbol slice operands.
     for (auto operand : sliceState->lbOperands[0]) {
       cst.addInductionVarOrTerminalSymbol(operand);
     }
     // Add upper/lower bounds from 'sliceState' to 'cst'.
     LogicalResult ret =
         cst.addSliceBounds(sliceState->ivs, sliceState->lbs, sliceState->ubs,
                            sliceState->lbOperands[0]);
     assert(succeeded(ret) &&
            "should not fail as we never have semi-affine slice maps");
     (void)ret;
   }

   // Add access function equalities to connect loop IVs to data dimensions.
   if (failed(cst.composeMap(&accessValueMap))) {
     op->emitError("getMemRefRegion: compose affine map failed");
     LLVM_DEBUG(accessValueMap.getAffineMap().dump());
     return failure();
   }

   // Set all identifiers appearing after the first 'rank' identifiers as
   // symbolic identifiers - so that the ones corresponding to the memref
   // dimensions are the dimensional identifiers for the memref region.
   cst.setDimSymbolSeparation(cst.getNumDimAndSymbolIds() - rank);

   // Eliminate any loop IVs other than the outermost 'loopDepth' IVs, on which
   // this memref region is symbolic.
   SmallVector<AffineForOp, 4> enclosingIVs;
   getLoopIVs(*op, &enclosingIVs);
   assert(loopDepth <= enclosingIVs.size() && "invalid loop depth");
   enclosingIVs.resize(loopDepth);
   SmallVector<Value, 4> ids;
   cst.getValues(cst.getNumDimIds(), cst.getNumDimAndSymbolIds(), &ids);
   for (auto id : ids) {
     AffineForOp iv;
     if ((iv = getForInductionVarOwner(id)) &&
         llvm::is_contained(enclosingIVs, iv) == false) {
       cst.projectOut(id);
     }
   }

   // Project out any local variables (these would have been added for any
   // mod/divs).
   cst.projectOut(cst.getNumDimAndSymbolIds(), cst.getNumLocalIds());

   // Constant fold any symbolic identifiers.
   cst.constantFoldIdRange(/*pos=*/cst.getNumDimIds(),
                           /*num=*/cst.getNumSymbolIds());

   assert(cst.getNumDimIds() == rank && "unexpected MemRefRegion format");

   // Add upper/lower bounds for each memref dimension with static size
   // to guard against potential over-approximation from projection.
   // TODO: Support dynamic memref dimensions.
   if (addMemRefDimBounds) {
     auto memRefType = memref.getType().cast<MemRefType>();
     for (unsigned r = 0; r < rank; r++) {
       cst.addBound(FlatAffineConstraints::LB, /*pos=*/r, /*value=*/0);
       if (memRefType.isDynamicDim(r))
         continue;
       cst.addBound(FlatAffineConstraints::UB, /*pos=*/r,
                    memRefType.getDimSize(r) - 1);
     }
   }
   cst.removeTrivialRedundancy();

   LLVM_DEBUG(llvm::dbgs() << "Memory region:\n");
   LLVM_DEBUG(cst.dump());
   return success();
 }

 static unsigned getMemRefEltSizeInBytes(MemRefType memRefType) {
   auto elementType = memRefType.getElementType();

   unsigned sizeInBits;
   if (elementType.isIntOrFloat()) {
     sizeInBits = elementType.getIntOrFloatBitWidth();
   } else {
     auto vectorType = elementType.cast<VectorType>();
     sizeInBits =
         vectorType.getElementTypeBitWidth() * vectorType.getNumElements();
   }
   return llvm::divideCeil(sizeInBits, 8);
 }

 // Returns the size of the region.
 Optional<int64_t> MemRefRegion::getRegionSize() {
   auto memRefType = memref.getType().cast<MemRefType>();

   if (!memRefType.getLayout().isIdentity()) {
     LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
     return false;
   }

   // Indices to use for the DmaStart op.
   // Indices for the original memref being DMAed from/to.
   SmallVector<Value, 4> memIndices;
   // Indices for the faster buffer being DMAed into/from.
   SmallVector<Value, 4> bufIndices;

   // Compute the extents of the buffer.
   Optional<int64_t> numElements = getConstantBoundingSizeAndShape();
   if (!numElements.hasValue()) {
     LLVM_DEBUG(llvm::dbgs() << "Dynamic shapes not yet supported\n");
     return None;
   }
   return getMemRefEltSizeInBytes(memRefType) * numElements.getValue();
 }

 /// Returns the size of memref data in bytes if it's statically shaped, None
 /// otherwise.  If the element of the memref has vector type, takes into account
 /// size of the vector as well.
 //  TODO: improve/complete this when we have target data.
 Optional<uint64_t> mlir::getMemRefSizeInBytes(MemRefType memRefType) {
   if (!memRefType.hasStaticShape())
     return None;
   auto elementType = memRefType.getElementType();
   if (!elementType.isIntOrFloat() && !elementType.isa<VectorType>())
     return None;

   uint64_t sizeInBytes = getMemRefEltSizeInBytes(memRefType);
   for (unsigned i = 0, e = memRefType.getRank(); i < e; i++) {
     sizeInBytes = sizeInBytes * memRefType.getDimSize(i);
   }
   return sizeInBytes;
 }

 template <typename LoadOrStoreOp>
 LogicalResult mlir::boundCheckLoadOrStoreOp(LoadOrStoreOp loadOrStoreOp,
                                             bool emitError) {
   static_assert(llvm::is_one_of<LoadOrStoreOp, AffineReadOpInterface,
                                 AffineWriteOpInterface>::value,
                 "argument should be either a AffineReadOpInterface or a "
                 "AffineWriteOpInterface");

   Operation *op = loadOrStoreOp.getOperation();
   MemRefRegion region(op->getLoc());
   if (failed(region.compute(op, /*loopDepth=*/0, /*sliceState=*/nullptr,
                             /*addMemRefDimBounds=*/false)))
     return success();

   LLVM_DEBUG(llvm::dbgs() << "Memory region");
   LLVM_DEBUG(region.getConstraints()->dump());

   bool outOfBounds = false;
   unsigned rank = loadOrStoreOp.getMemRefType().getRank();

   // For each dimension, check for out of bounds.
   for (unsigned r = 0; r < rank; r++) {
     FlatAffineConstraints ucst(*region.getConstraints());

     // Intersect memory region with constraint capturing out of bounds (both out
     // of upper and out of lower), and check if the constraint system is
     // feasible. If it is, there is at least one point out of bounds.
     SmallVector<int64_t, 4> ineq(rank + 1, 0);
     int64_t dimSize = loadOrStoreOp.getMemRefType().getDimSize(r);
     // TODO: handle dynamic dim sizes.
     if (dimSize == -1)
       continue;

     // Check for overflow: d_i >= memref dim size.
     ucst.addBound(FlatAffineConstraints::LB, r, dimSize);
     outOfBounds = !ucst.isEmpty();
     if (outOfBounds && emitError) {
       loadOrStoreOp.emitOpError()
           << "memref out of upper bound access along dimension #" << (r + 1);
     }

     // Check for a negative index.
     FlatAffineConstraints lcst(*region.getConstraints());
     std::fill(ineq.begin(), ineq.end(), 0);
     // d_i <= -1;
     lcst.addBound(FlatAffineConstraints::UB, r, -1);
     outOfBounds = !lcst.isEmpty();
     if (outOfBounds && emitError) {
       loadOrStoreOp.emitOpError()
           << "memref out of lower bound access along dimension #" << (r + 1);
     }
   }
   return failure(outOfBounds);
 }

 // Explicitly instantiate the template so that the compiler knows we need them!
 template LogicalResult
 mlir::boundCheckLoadOrStoreOp(AffineReadOpInterface loadOp, bool emitError);
 template LogicalResult
 mlir::boundCheckLoadOrStoreOp(AffineWriteOpInterface storeOp, bool emitError);

 // Returns in 'positions' the Block positions of 'op' in each ancestor
 // Block from the Block containing operation, stopping at 'limitBlock'.
 static void findInstPosition(Operation *op, Block *limitBlock,
                              SmallVectorImpl<unsigned> *positions) {
   Block *block = op->getBlock();
   while (block != limitBlock) {
     // FIXME: This algorithm is unnecessarily O(n) and should be improved to not
     // rely on linear scans.
     int instPosInBlock = std::distance(block->begin(), op->getIterator());
     positions->push_back(instPosInBlock);
     op = block->getParentOp();
     block = op->getBlock();
   }
   std::reverse(positions->begin(), positions->end());
 }

 // Returns the Operation in a possibly nested set of Blocks, where the
 // position of the operation is represented by 'positions', which has a
 // Block position for each level of nesting.
 static Operation *getInstAtPosition(ArrayRef<unsigned> positions,
                                     unsigned level, Block *block) {
   unsigned i = 0;
   for (auto &op : *block) {
     if (i != positions[level]) {
       ++i;
       continue;
     }
     if (level == positions.size() - 1)
       return &op;
     if (auto childAffineForOp = dyn_cast<AffineForOp>(op))
       return getInstAtPosition(positions, level + 1,
                                childAffineForOp.getBody());

     for (auto &region : op.getRegions()) {
       for (auto &b : region)
         if (auto *ret = getInstAtPosition(positions, level + 1, &b))
           return ret;
     }
     return nullptr;
   }
   return nullptr;
 }

 // Adds loop IV bounds to 'cst' for loop IVs not found in 'ivs'.
 static LogicalResult addMissingLoopIVBounds(SmallPtrSet<Value, 8> &ivs,
                                             FlatAffineValueConstraints *cst) {
   for (unsigned i = 0, e = cst->getNumDimIds(); i < e; ++i) {
     auto value = cst->getValue(i);
     if (ivs.count(value) == 0) {
       assert(isForInductionVar(value));
       auto loop = getForInductionVarOwner(value);
       if (failed(cst->addAffineForOpDomain(loop)))
         return failure();
     }
   }
   return success();
 }

 /// Returns the innermost common loop depth for the set of operations in 'ops'.
 // TODO: Move this to LoopUtils.
 unsigned mlir::getInnermostCommonLoopDepth(
     ArrayRef<Operation *> ops, SmallVectorImpl<AffineForOp> *surroundingLoops) {
   unsigned numOps = ops.size();
   assert(numOps > 0 && "Expected at least one operation");

   std::vector<SmallVector<AffineForOp, 4>> loops(numOps);
   unsigned loopDepthLimit = std::numeric_limits<unsigned>::max();
   for (unsigned i = 0; i < numOps; ++i) {
     getLoopIVs(*ops[i], &loops[i]);
     loopDepthLimit =
         std::min(loopDepthLimit, static_cast<unsigned>(loops[i].size()));
   }

   unsigned loopDepth = 0;
   for (unsigned d = 0; d < loopDepthLimit; ++d) {
     unsigned i;
     for (i = 1; i < numOps; ++i) {
       if (loops[i - 1][d] != loops[i][d])
         return loopDepth;
     }
     if (surroundingLoops)
       surroundingLoops->push_back(loops[i - 1][d]);
     ++loopDepth;
   }
   return loopDepth;
 }

 /// Computes in 'sliceUnion' the union of all slice bounds computed at
 /// 'loopDepth' between all dependent pairs of ops in 'opsA' and 'opsB', and
 /// then verifies if it is valid. Returns 'SliceComputationResult::Success' if
 /// union was computed correctly, an appropriate failure otherwise.
 SliceComputationResult
 mlir::computeSliceUnion(ArrayRef<Operation *> opsA, ArrayRef<Operation *> opsB,
                         unsigned loopDepth, unsigned numCommonLoops,
                         bool isBackwardSlice,
                         ComputationSliceState *sliceUnion) {
   // Compute the union of slice bounds between all pairs in 'opsA' and
   // 'opsB' in 'sliceUnionCst'.
   FlatAffineValueConstraints sliceUnionCst;
   assert(sliceUnionCst.getNumDimAndSymbolIds() == 0);
   std::vector<std::pair<Operation *, Operation *>> dependentOpPairs;
   for (unsigned i = 0, numOpsA = opsA.size(); i < numOpsA; ++i) {
     MemRefAccess srcAccess(opsA[i]);
     for (unsigned j = 0, numOpsB = opsB.size(); j < numOpsB; ++j) {
       MemRefAccess dstAccess(opsB[j]);
       if (srcAccess.memref != dstAccess.memref)
         continue;
       // Check if 'loopDepth' exceeds nesting depth of src/dst ops.
       if ((!isBackwardSlice && loopDepth > getNestingDepth(opsA[i])) ||
           (isBackwardSlice && loopDepth > getNestingDepth(opsB[j]))) {
         LLVM_DEBUG(llvm::dbgs() << "Invalid loop depth\n");
         return SliceComputationResult::GenericFailure;
       }

       bool readReadAccesses = isa<AffineReadOpInterface>(srcAccess.opInst) &&
                               isa<AffineReadOpInterface>(dstAccess.opInst);
       FlatAffineValueConstraints dependenceConstraints;
       // Check dependence between 'srcAccess' and 'dstAccess'.
       DependenceResult result = checkMemrefAccessDependence(
           srcAccess, dstAccess, /*loopDepth=*/numCommonLoops + 1,
           &dependenceConstraints, /*dependenceComponents=*/nullptr,
           /*allowRAR=*/readReadAccesses);
       if (result.value == DependenceResult::Failure) {
         LLVM_DEBUG(llvm::dbgs() << "Dependence check failed\n");
         return SliceComputationResult::GenericFailure;
       }
       if (result.value == DependenceResult::NoDependence)
         continue;
       dependentOpPairs.push_back({opsA[i], opsB[j]});

       // Compute slice bounds for 'srcAccess' and 'dstAccess'.
       ComputationSliceState tmpSliceState;
       mlir::getComputationSliceState(opsA[i], opsB[j], &dependenceConstraints,
                                      loopDepth, isBackwardSlice,
                                      &tmpSliceState);

       if (sliceUnionCst.getNumDimAndSymbolIds() == 0) {
         // Initialize 'sliceUnionCst' with the bounds computed in previous step.
         if (failed(tmpSliceState.getAsConstraints(&sliceUnionCst))) {
           LLVM_DEBUG(llvm::dbgs()
                      << "Unable to compute slice bound constraints\n");
           return SliceComputationResult::GenericFailure;
         }
         assert(sliceUnionCst.getNumDimAndSymbolIds() > 0);
         continue;
       }

       // Compute constraints for 'tmpSliceState' in 'tmpSliceCst'.
       FlatAffineValueConstraints tmpSliceCst;
       if (failed(tmpSliceState.getAsConstraints(&tmpSliceCst))) {
         LLVM_DEBUG(llvm::dbgs()
                    << "Unable to compute slice bound constraints\n");
         return SliceComputationResult::GenericFailure;
       }

       // Align coordinate spaces of 'sliceUnionCst' and 'tmpSliceCst' if needed.
       if (!sliceUnionCst.areIdsAlignedWithOther(tmpSliceCst)) {

         // Pre-constraint id alignment: record loop IVs used in each constraint
         // system.
         SmallPtrSet<Value, 8> sliceUnionIVs;
         for (unsigned k = 0, l = sliceUnionCst.getNumDimIds(); k < l; ++k)
           sliceUnionIVs.insert(sliceUnionCst.getValue(k));
         SmallPtrSet<Value, 8> tmpSliceIVs;
         for (unsigned k = 0, l = tmpSliceCst.getNumDimIds(); k < l; ++k)
           tmpSliceIVs.insert(tmpSliceCst.getValue(k));

         sliceUnionCst.mergeAndAlignIdsWithOther(/*offset=*/0, &tmpSliceCst);

         // Post-constraint id alignment: add loop IV bounds missing after
         // id alignment to constraint systems. This can occur if one constraint
         // system uses an loop IV that is not used by the other. The call
         // to unionBoundingBox below expects constraints for each Loop IV, even
         // if they are the unsliced full loop bounds added here.
         if (failed(addMissingLoopIVBounds(sliceUnionIVs, &sliceUnionCst)))
           return SliceComputationResult::GenericFailure;
         if (failed(addMissingLoopIVBounds(tmpSliceIVs, &tmpSliceCst)))
           return SliceComputationResult::GenericFailure;
       }
       // Compute union bounding box of 'sliceUnionCst' and 'tmpSliceCst'.
       if (sliceUnionCst.getNumLocalIds() > 0 ||
           tmpSliceCst.getNumLocalIds() > 0 ||
           failed(sliceUnionCst.unionBoundingBox(tmpSliceCst))) {
         LLVM_DEBUG(llvm::dbgs()
                    << "Unable to compute union bounding box of slice bounds\n");
         return SliceComputationResult::GenericFailure;
       }
     }
   }

   // Empty union.
   if (sliceUnionCst.getNumDimAndSymbolIds() == 0)
     return SliceComputationResult::GenericFailure;

   // Gather loops surrounding ops from loop nest where slice will be inserted.
   SmallVector<Operation *, 4> ops;
   for (auto &dep : dependentOpPairs) {
     ops.push_back(isBackwardSlice ? dep.second : dep.first);
   }
   SmallVector<AffineForOp, 4> surroundingLoops;
   unsigned innermostCommonLoopDepth =
       getInnermostCommonLoopDepth(ops, &surroundingLoops);
   if (loopDepth > innermostCommonLoopDepth) {
     LLVM_DEBUG(llvm::dbgs() << "Exceeds max loop depth\n");
     return SliceComputationResult::GenericFailure;
   }

   // Store 'numSliceLoopIVs' before converting dst loop IVs to dims.
   unsigned numSliceLoopIVs = sliceUnionCst.getNumDimIds();

   // Convert any dst loop IVs which are symbol identifiers to dim identifiers.
   sliceUnionCst.convertLoopIVSymbolsToDims();
   sliceUnion->clearBounds();
   sliceUnion->lbs.resize(numSliceLoopIVs, AffineMap());
   sliceUnion->ubs.resize(numSliceLoopIVs, AffineMap());

   // Get slice bounds from slice union constraints 'sliceUnionCst'.
   sliceUnionCst.getSliceBounds(/*offset=*/0, numSliceLoopIVs,
                                opsA[0]->getContext(), &sliceUnion->lbs,
                                &sliceUnion->ubs);

   // Add slice bound operands of union.
   SmallVector<Value, 4> sliceBoundOperands;
   sliceUnionCst.getValues(numSliceLoopIVs,
                           sliceUnionCst.getNumDimAndSymbolIds(),
                           &sliceBoundOperands);

   // Copy src loop IVs from 'sliceUnionCst' to 'sliceUnion'.
   sliceUnion->ivs.clear();
   sliceUnionCst.getValues(0, numSliceLoopIVs, &sliceUnion->ivs);

   // Set loop nest insertion point to block start at 'loopDepth'.
   sliceUnion->insertPoint =
       isBackwardSlice
           ? surroundingLoops[loopDepth - 1].getBody()->begin()
           : std::prev(surroundingLoops[loopDepth - 1].getBody()->end());

   // Give each bound its own copy of 'sliceBoundOperands' for subsequent
   // canonicalization.
   sliceUnion->lbOperands.resize(numSliceLoopIVs, sliceBoundOperands);
   sliceUnion->ubOperands.resize(numSliceLoopIVs, sliceBoundOperands);

   // Check if the slice computed is valid. Return success only if it is verified
   // that the slice is valid, otherwise return appropriate failure status.
   Optional<bool> isSliceValid = sliceUnion->isSliceValid();
   if (!isSliceValid.hasValue()) {
     LLVM_DEBUG(llvm::dbgs() << "Cannot determine if the slice is valid\n");
     return SliceComputationResult::GenericFailure;
   }
   if (!isSliceValid.getValue())
     return SliceComputationResult::IncorrectSliceFailure;

   return SliceComputationResult::Success;
 }

 // TODO: extend this to handle multiple result maps.
 static Optional<uint64_t> getConstDifference(AffineMap lbMap, AffineMap ubMap) {
   assert(lbMap.getNumResults() == 1 && "expected single result bound map");
   assert(ubMap.getNumResults() == 1 && "expected single result bound map");
   assert(lbMap.getNumDims() == ubMap.getNumDims());
   assert(lbMap.getNumSymbols() == ubMap.getNumSymbols());
   AffineExpr lbExpr(lbMap.getResult(0));
   AffineExpr ubExpr(ubMap.getResult(0));
   auto loopSpanExpr = simplifyAffineExpr(ubExpr - lbExpr, lbMap.getNumDims(),
                                          lbMap.getNumSymbols());
   auto cExpr = loopSpanExpr.dyn_cast<AffineConstantExpr>();
   if (!cExpr)
     return None;
   return cExpr.getValue();
 }

 // Builds a map 'tripCountMap' from AffineForOp to constant trip count for loop
 // nest surrounding represented by slice loop bounds in 'slice'. Returns true
 // on success, false otherwise (if a non-constant trip count was encountered).
 // TODO: Make this work with non-unit step loops.
 bool mlir::buildSliceTripCountMap(
     const ComputationSliceState &slice,
     llvm::SmallDenseMap<Operation *, uint64_t, 8> *tripCountMap) {
   unsigned numSrcLoopIVs = slice.ivs.size();
   // Populate map from AffineForOp -> trip count
   for (unsigned i = 0; i < numSrcLoopIVs; ++i) {
     AffineForOp forOp = getForInductionVarOwner(slice.ivs[i]);
     auto *op = forOp.getOperation();
     AffineMap lbMap = slice.lbs[i];
     AffineMap ubMap = slice.ubs[i];
     // If lower or upper bound maps are null or provide no results, it implies
     // that source loop was not at all sliced, and the entire loop will be a
     // part of the slice.
     if (!lbMap || lbMap.getNumResults() == 0 || !ubMap ||
         ubMap.getNumResults() == 0) {
       // The iteration of src loop IV 'i' was not sliced. Use full loop bounds.
       if (forOp.hasConstantLowerBound() && forOp.hasConstantUpperBound()) {
         (*tripCountMap)[op] =
             forOp.getConstantUpperBound() - forOp.getConstantLowerBound();
         continue;
       }
       Optional<uint64_t> maybeConstTripCount = getConstantTripCount(forOp);
       if (maybeConstTripCount.hasValue()) {
         (*tripCountMap)[op] = maybeConstTripCount.getValue();
         continue;
       }
       return false;
     }
     Optional<uint64_t> tripCount = getConstDifference(lbMap, ubMap);
     // Slice bounds are created with a constant ub - lb difference.
     if (!tripCount.hasValue())
       return false;
     (*tripCountMap)[op] = tripCount.getValue();
   }
   return true;
 }

 // Return the number of iterations in the given slice.
 uint64_t mlir::getSliceIterationCount(
     const llvm::SmallDenseMap<Operation *, uint64_t, 8> &sliceTripCountMap) {
   uint64_t iterCount = 1;
   for (const auto &count : sliceTripCountMap) {
     iterCount *= count.second;
   }
   return iterCount;
 }

 const char *const kSliceFusionBarrierAttrName = "slice_fusion_barrier";
 // Computes slice bounds by projecting out any loop IVs from
 // 'dependenceConstraints' at depth greater than 'loopDepth', and computes slice
 // bounds in 'sliceState' which represent the one loop nest's IVs in terms of
 // the other loop nest's IVs, symbols and constants (using 'isBackwardsSlice').
 void mlir::getComputationSliceState(
     Operation *depSourceOp, Operation *depSinkOp,
     FlatAffineValueConstraints *dependenceConstraints, unsigned loopDepth,
     bool isBackwardSlice, ComputationSliceState *sliceState) {
   // Get loop nest surrounding src operation.
   SmallVector<AffineForOp, 4> srcLoopIVs;
   getLoopIVs(*depSourceOp, &srcLoopIVs);
   unsigned numSrcLoopIVs = srcLoopIVs.size();

   // Get loop nest surrounding dst operation.
   SmallVector<AffineForOp, 4> dstLoopIVs;
   getLoopIVs(*depSinkOp, &dstLoopIVs);
   unsigned numDstLoopIVs = dstLoopIVs.size();

   assert((!isBackwardSlice && loopDepth <= numSrcLoopIVs) ||
          (isBackwardSlice && loopDepth <= numDstLoopIVs));

   // Project out dimensions other than those up to 'loopDepth'.
   unsigned pos = isBackwardSlice ? numSrcLoopIVs + loopDepth : loopDepth;
   unsigned num =
       isBackwardSlice ? numDstLoopIVs - loopDepth : numSrcLoopIVs - loopDepth;
   dependenceConstraints->projectOut(pos, num);

   // Add slice loop IV values to 'sliceState'.
   unsigned offset = isBackwardSlice ? 0 : loopDepth;
   unsigned numSliceLoopIVs = isBackwardSlice ? numSrcLoopIVs : numDstLoopIVs;
   dependenceConstraints->getValues(offset, offset + numSliceLoopIVs,
                                    &sliceState->ivs);

   // Set up lower/upper bound affine maps for the slice.
   sliceState->lbs.resize(numSliceLoopIVs, AffineMap());
   sliceState->ubs.resize(numSliceLoopIVs, AffineMap());

   // Get bounds for slice IVs in terms of other IVs, symbols, and constants.
   dependenceConstraints->getSliceBounds(offset, numSliceLoopIVs,
                                         depSourceOp->getContext(),
                                         &sliceState->lbs, &sliceState->ubs);

   // Set up bound operands for the slice's lower and upper bounds.
   SmallVector<Value, 4> sliceBoundOperands;
   unsigned numDimsAndSymbols = dependenceConstraints->getNumDimAndSymbolIds();
   for (unsigned i = 0; i < numDimsAndSymbols; ++i) {
     if (i < offset || i >= offset + numSliceLoopIVs) {
       sliceBoundOperands.push_back(dependenceConstraints->getValue(i));
     }
   }

   // Give each bound its own copy of 'sliceBoundOperands' for subsequent
   // canonicalization.
   sliceState->lbOperands.resize(numSliceLoopIVs, sliceBoundOperands);
   sliceState->ubOperands.resize(numSliceLoopIVs, sliceBoundOperands);

   // Set destination loop nest insertion point to block start at 'dstLoopDepth'.
   sliceState->insertPoint =
       isBackwardSlice ? dstLoopIVs[loopDepth - 1].getBody()->begin()
                       : std::prev(srcLoopIVs[loopDepth - 1].getBody()->end());

   llvm::SmallDenseSet<Value, 8> sequentialLoops;
   if (isa<AffineReadOpInterface>(depSourceOp) &&
       isa<AffineReadOpInterface>(depSinkOp)) {
     // For read-read access pairs, clear any slice bounds on sequential loops.
     // Get sequential loops in loop nest rooted at 'srcLoopIVs[0]'.
     getSequentialLoops(isBackwardSlice ? srcLoopIVs[0] : dstLoopIVs[0],
                        &sequentialLoops);
   }
   auto getSliceLoop = [&](unsigned i) {
     return isBackwardSlice ? srcLoopIVs[i] : dstLoopIVs[i];
   };
   auto isInnermostInsertion = [&]() {
     return (isBackwardSlice ? loopDepth >= srcLoopIVs.size()
                             : loopDepth >= dstLoopIVs.size());
   };
   llvm::SmallDenseMap<Operation *, uint64_t, 8> sliceTripCountMap;
   auto srcIsUnitSlice = [&]() {
     return (buildSliceTripCountMap(*sliceState, &sliceTripCountMap) &&
             (getSliceIterationCount(sliceTripCountMap) == 1));
   };
   // Clear all sliced loop bounds beginning at the first sequential loop, or
   // first loop with a slice fusion barrier attribute..

   for (unsigned i = 0; i < numSliceLoopIVs; ++i) {
     Value iv = getSliceLoop(i).getInductionVar();
     if (sequentialLoops.count(iv) == 0 &&
         getSliceLoop(i)->getAttr(kSliceFusionBarrierAttrName) == nullptr)
       continue;
     // Skip reset of bounds of reduction loop inserted in the destination loop
     // that meets the following conditions:
     //    1. Slice is  single trip count.
     //    2. Loop bounds of the source and destination match.
     //    3. Is being inserted at the innermost insertion point.
     Optional<bool> isMaximal = sliceState->isMaximal();
     if (isLoopParallelAndContainsReduction(getSliceLoop(i)) &&
         isInnermostInsertion() && srcIsUnitSlice() && isMaximal.hasValue() &&
         isMaximal.getValue())
       continue;
     for (unsigned j = i; j < numSliceLoopIVs; ++j) {
       sliceState->lbs[j] = AffineMap();
       sliceState->ubs[j] = AffineMap();
     }
     break;
   }
 }

 /// Creates a computation slice of the loop nest surrounding 'srcOpInst',
 /// updates the slice loop bounds with any non-null bound maps specified in
 /// 'sliceState', and inserts this slice into the loop nest surrounding
 /// 'dstOpInst' at loop depth 'dstLoopDepth'.
 // TODO: extend the slicing utility to compute slices that
 // aren't necessarily a one-to-one relation b/w the source and destination. The
 // relation between the source and destination could be many-to-many in general.
 // TODO: the slice computation is incorrect in the cases
 // where the dependence from the source to the destination does not cover the
 // entire destination index set. Subtract out the dependent destination
 // iterations from destination index set and check for emptiness --- this is one
 // solution.
 AffineForOp
 mlir::insertBackwardComputationSlice(Operation *srcOpInst, Operation *dstOpInst,
                                      unsigned dstLoopDepth,
                                      ComputationSliceState *sliceState) {
   // Get loop nest surrounding src operation.
   SmallVector<AffineForOp, 4> srcLoopIVs;
   getLoopIVs(*srcOpInst, &srcLoopIVs);
   unsigned numSrcLoopIVs = srcLoopIVs.size();

   // Get loop nest surrounding dst operation.
   SmallVector<AffineForOp, 4> dstLoopIVs;
   getLoopIVs(*dstOpInst, &dstLoopIVs);
   unsigned dstLoopIVsSize = dstLoopIVs.size();
   if (dstLoopDepth > dstLoopIVsSize) {
     dstOpInst->emitError("invalid destination loop depth");
     return AffineForOp();
   }

   // Find the op block positions of 'srcOpInst' within 'srcLoopIVs'.
   SmallVector<unsigned, 4> positions;
   // TODO: This code is incorrect since srcLoopIVs can be 0-d.
   findInstPosition(srcOpInst, srcLoopIVs[0]->getBlock(), &positions);

   // Clone src loop nest and insert it a the beginning of the operation block
   // of the loop at 'dstLoopDepth' in 'dstLoopIVs'.
   auto dstAffineForOp = dstLoopIVs[dstLoopDepth - 1];
   OpBuilder b(dstAffineForOp.getBody(), dstAffineForOp.getBody()->begin());
   auto sliceLoopNest =
       cast<AffineForOp>(b.clone(*srcLoopIVs[0].getOperation()));

   Operation *sliceInst =
       getInstAtPosition(positions, /*level=*/0, sliceLoopNest.getBody());
   // Get loop nest surrounding 'sliceInst'.
   SmallVector<AffineForOp, 4> sliceSurroundingLoops;
   getLoopIVs(*sliceInst, &sliceSurroundingLoops);

   // Sanity check.
   unsigned sliceSurroundingLoopsSize = sliceSurroundingLoops.size();
   (void)sliceSurroundingLoopsSize;
   assert(dstLoopDepth + numSrcLoopIVs >= sliceSurroundingLoopsSize);
   unsigned sliceLoopLimit = dstLoopDepth + numSrcLoopIVs;
   (void)sliceLoopLimit;
   assert(sliceLoopLimit >= sliceSurroundingLoopsSize);

   // Update loop bounds for loops in 'sliceLoopNest'.
   for (unsigned i = 0; i < numSrcLoopIVs; ++i) {
     auto forOp = sliceSurroundingLoops[dstLoopDepth + i];
     if (AffineMap lbMap = sliceState->lbs[i])
       forOp.setLowerBound(sliceState->lbOperands[i], lbMap);
     if (AffineMap ubMap = sliceState->ubs[i])
       forOp.setUpperBound(sliceState->ubOperands[i], ubMap);
   }
   return sliceLoopNest;
 }

 // Constructs  MemRefAccess populating it with the memref, its indices and
 // opinst from 'loadOrStoreOpInst'.
 MemRefAccess::MemRefAccess(Operation *loadOrStoreOpInst) {
   if (auto loadOp = dyn_cast<AffineReadOpInterface>(loadOrStoreOpInst)) {
     memref = loadOp.getMemRef();
     opInst = loadOrStoreOpInst;
     auto loadMemrefType = loadOp.getMemRefType();
     indices.reserve(loadMemrefType.getRank());
     for (auto index : loadOp.getMapOperands()) {
       indices.push_back(index);
     }
   } else {
     assert(isa<AffineWriteOpInterface>(loadOrStoreOpInst) &&
            "Affine read/write op expected");
     auto storeOp = cast<AffineWriteOpInterface>(loadOrStoreOpInst);
     opInst = loadOrStoreOpInst;
     memref = storeOp.getMemRef();
     auto storeMemrefType = storeOp.getMemRefType();
     indices.reserve(storeMemrefType.getRank());
     for (auto index : storeOp.getMapOperands()) {
       indices.push_back(index);
     }
   }
 }

 unsigned MemRefAccess::getRank() const {
   return memref.getType().cast<MemRefType>().getRank();
 }

 bool MemRefAccess::isStore() const {
   return isa<AffineWriteOpInterface>(opInst);
 }

 /// Returns the nesting depth of this statement, i.e., the number of loops
 /// surrounding this statement.
 unsigned mlir::getNestingDepth(Operation *op) {
   Operation *currOp = op;
   unsigned depth = 0;
   while ((currOp = currOp->getParentOp())) {
     if (isa<AffineForOp>(currOp))
       depth++;
   }
   return depth;
 }

 /// Equal if both affine accesses are provably equivalent (at compile
 /// time) when considering the memref, the affine maps and their respective
 /// operands. The equality of access functions + operands is checked by
 /// subtracting fully composed value maps, and then simplifying the difference
 /// using the expression flattener.
 /// TODO: this does not account for aliasing of memrefs.
 bool MemRefAccess::operator==(const MemRefAccess &rhs) const {
   if (memref != rhs.memref)
     return false;

   AffineValueMap diff, thisMap, rhsMap;
   getAccessMap(&thisMap);
   rhs.getAccessMap(&rhsMap);
   AffineValueMap::difference(thisMap, rhsMap, &diff);
   return llvm::all_of(diff.getAffineMap().getResults(),
                       [](AffineExpr e) { return e == 0; });
 }

 /// Returns the number of surrounding loops common to 'loopsA' and 'loopsB',
 /// where each lists loops from outer-most to inner-most in loop nest.
 unsigned mlir::getNumCommonSurroundingLoops(Operation &A, Operation &B) {
   SmallVector<AffineForOp, 4> loopsA, loopsB;
   getLoopIVs(A, &loopsA);
   getLoopIVs(B, &loopsB);

   unsigned minNumLoops = std::min(loopsA.size(), loopsB.size());
   unsigned numCommonLoops = 0;
   for (unsigned i = 0; i < minNumLoops; ++i) {
     if (loopsA[i].getOperation() != loopsB[i].getOperation())
       break;
     ++numCommonLoops;
   }
   return numCommonLoops;
 }

 static Optional<int64_t> getMemoryFootprintBytes(Block &block,
                                                  Block::iterator start,
                                                  Block::iterator end,
                                                  int memorySpace) {
   SmallDenseMap<Value, std::unique_ptr<MemRefRegion>, 4> regions;

   // Walk this 'affine.for' operation to gather all memory regions.
   auto result = block.walk(start, end, [&](Operation *opInst) -> WalkResult {
     if (!isa<AffineReadOpInterface, AffineWriteOpInterface>(opInst)) {
       // Neither load nor a store op.
       return WalkResult::advance();
     }

     // Compute the memref region symbolic in any IVs enclosing this block.
     auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
     if (failed(
             region->compute(opInst,
                             /*loopDepth=*/getNestingDepth(&*block.begin())))) {
       return opInst->emitError("error obtaining memory region\n");
     }

     auto it = regions.find(region->memref);
     if (it == regions.end()) {
       regions[region->memref] = std::move(region);
     } else if (failed(it->second->unionBoundingBox(*region))) {
       return opInst->emitWarning(
           "getMemoryFootprintBytes: unable to perform a union on a memory "
           "region");
     }
     return WalkResult::advance();
   });
   if (result.wasInterrupted())
     return None;

   int64_t totalSizeInBytes = 0;
   for (const auto &region : regions) {
     Optional<int64_t> size = region.second->getRegionSize();
     if (!size.hasValue())
       return None;
     totalSizeInBytes += size.getValue();
   }
   return totalSizeInBytes;
 }

 Optional<int64_t> mlir::getMemoryFootprintBytes(AffineForOp forOp,
                                                 int memorySpace) {
   auto *forInst = forOp.getOperation();
   return ::getMemoryFootprintBytes(
       *forInst->getBlock(), Block::iterator(forInst),
       std::next(Block::iterator(forInst)), memorySpace);
 }

 /// Returns whether a loop is parallel and contains a reduction loop.
 bool mlir::isLoopParallelAndContainsReduction(AffineForOp forOp) {
   SmallVector<LoopReduction> reductions;
   if (!isLoopParallel(forOp, &reductions))
     return false;
   return !reductions.empty();
 }

 /// Returns in 'sequentialLoops' all sequential loops in loop nest rooted
 /// at 'forOp'.
 void mlir::getSequentialLoops(AffineForOp forOp,
                               llvm::SmallDenseSet<Value, 8> *sequentialLoops) {
   forOp->walk([&](Operation *op) {
     if (auto innerFor = dyn_cast<AffineForOp>(op))
       if (!isLoopParallel(innerFor))
         sequentialLoops->insert(innerFor.getInductionVar());
   });
 }

 IntegerSet mlir::simplifyIntegerSet(IntegerSet set) {
   FlatAffineConstraints fac(set);
   if (fac.isEmpty())
     return IntegerSet::getEmptySet(set.getNumDims(), set.getNumSymbols(),
                                    set.getContext());
   fac.removeTrivialRedundancy();

   auto simplifiedSet = fac.getAsIntegerSet(set.getContext());
   assert(simplifiedSet && "guaranteed to succeed while roundtripping");
   return simplifiedSet;
 }