mlir/lib/Analysis/LoopAnalysis.cpp - llvm-project - Git at Google

 //===- LoopAnalysis.cpp - Misc loop analysis routines //-------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file implements miscellaneous loop analysis routines.
 //
 //===----------------------------------------------------------------------===//

 #include "mlir/Analysis/LoopAnalysis.h"

 #include "mlir/Analysis/AffineAnalysis.h"
 #include "mlir/Analysis/AffineStructures.h"
 #include "mlir/Analysis/NestedMatcher.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
 #include "mlir/Support/MathExtras.h"

 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
 #include <type_traits>

 using namespace mlir;

 /// Returns the trip count of the loop as an affine expression if the latter is
 /// expressible as an affine expression, and nullptr otherwise. The trip count
 /// expression is simplified before returning. This method only utilizes map
 /// composition to construct lower and upper bounds before computing the trip
 /// count expressions.
 void mlir::getTripCountMapAndOperands(
     AffineForOp forOp, AffineMap *tripCountMap,
     SmallVectorImpl<Value> *tripCountOperands) {
   MLIRContext *context = forOp.getContext();
   int64_t step = forOp.getStep();
   int64_t loopSpan;
   if (forOp.hasConstantBounds()) {
     int64_t lb = forOp.getConstantLowerBound();
     int64_t ub = forOp.getConstantUpperBound();
     loopSpan = ub - lb;
     if (loopSpan < 0)
       loopSpan = 0;
     *tripCountMap = AffineMap::getConstantMap(ceilDiv(loopSpan, step), context);
     tripCountOperands->clear();
     return;
   }
   auto lbMap = forOp.getLowerBoundMap();
   auto ubMap = forOp.getUpperBoundMap();
   if (lbMap.getNumResults() != 1) {
     *tripCountMap = AffineMap();
     return;
   }

   // Difference of each upper bound expression from the single lower bound
   // expression (divided by the step) provides the expressions for the trip
   // count map.
   AffineValueMap ubValueMap(ubMap, forOp.getUpperBoundOperands());

   SmallVector<AffineExpr, 4> lbSplatExpr(ubValueMap.getNumResults(),
                                          lbMap.getResult(0));
   auto lbMapSplat = AffineMap::get(lbMap.getNumDims(), lbMap.getNumSymbols(),
                                    lbSplatExpr, context);
   AffineValueMap lbSplatValueMap(lbMapSplat, forOp.getLowerBoundOperands());

   AffineValueMap tripCountValueMap;
   AffineValueMap::difference(ubValueMap, lbSplatValueMap, &tripCountValueMap);
   for (unsigned i = 0, e = tripCountValueMap.getNumResults(); i < e; ++i)
     tripCountValueMap.setResult(i,
                                 tripCountValueMap.getResult(i).ceilDiv(step));

   *tripCountMap = tripCountValueMap.getAffineMap();
   tripCountOperands->assign(tripCountValueMap.getOperands().begin(),
                             tripCountValueMap.getOperands().end());
 }

 /// Returns the trip count of the loop if it's a constant, None otherwise. This
 /// method uses affine expression analysis (in turn using getTripCount) and is
 /// able to determine constant trip count in non-trivial cases.
 Optional<uint64_t> mlir::getConstantTripCount(AffineForOp forOp) {
   SmallVector<Value, 4> operands;
   AffineMap map;
   getTripCountMapAndOperands(forOp, &map, &operands);

   if (!map)
     return None;

   // Take the min if all trip counts are constant.
   Optional<uint64_t> tripCount;
   for (auto resultExpr : map.getResults()) {
     if (auto constExpr = resultExpr.dyn_cast<AffineConstantExpr>()) {
       if (tripCount.hasValue())
         tripCount = std::min(tripCount.getValue(),
                              static_cast<uint64_t>(constExpr.getValue()));
       else
         tripCount = constExpr.getValue();
     } else
       return None;
   }
   return tripCount;
 }

 /// Returns the greatest known integral divisor of the trip count. Affine
 /// expression analysis is used (indirectly through getTripCount), and
 /// this method is thus able to determine non-trivial divisors.
 uint64_t mlir::getLargestDivisorOfTripCount(AffineForOp forOp) {
   SmallVector<Value, 4> operands;
   AffineMap map;
   getTripCountMapAndOperands(forOp, &map, &operands);

   if (!map)
     return 1;

   // The largest divisor of the trip count is the GCD of the individual largest
   // divisors.
   assert(map.getNumResults() >= 1 && "expected one or more results");
   Optional<uint64_t> gcd;
   for (auto resultExpr : map.getResults()) {
     uint64_t thisGcd;
     if (auto constExpr = resultExpr.dyn_cast<AffineConstantExpr>()) {
       uint64_t tripCount = constExpr.getValue();
       // 0 iteration loops (greatest divisor is 2^64 - 1).
       if (tripCount == 0)
         thisGcd = std::numeric_limits<uint64_t>::max();
       else
         // The greatest divisor is the trip count.
         thisGcd = tripCount;
     } else {
       // Trip count is not a known constant; return its largest known divisor.
       thisGcd = resultExpr.getLargestKnownDivisor();
     }
     if (gcd.hasValue())
       gcd = llvm::GreatestCommonDivisor64(gcd.getValue(), thisGcd);
     else
       gcd = thisGcd;
   }
   assert(gcd.hasValue() && "value expected per above logic");
   return gcd.getValue();
 }

 /// Given an induction variable `iv` of type AffineForOp and an access `index`
 /// of type index, returns `true` if `index` is independent of `iv` and
 /// false otherwise. The determination supports composition with at most one
 /// AffineApplyOp. The 'at most one AffineApplyOp' comes from the fact that
 /// the composition of AffineApplyOp needs to be canonicalized by construction
 /// to avoid writing code that composes arbitrary numbers of AffineApplyOps
 /// everywhere. To achieve this, at the very least, the compose-affine-apply
 /// pass must have been run.
 ///
 /// Prerequisites:
 ///   1. `iv` and `index` of the proper type;
 ///   2. at most one reachable AffineApplyOp from index;
 ///
 /// Returns false in cases with more than one AffineApplyOp, this is
 /// conservative.
 static bool isAccessIndexInvariant(Value iv, Value index) {
   assert(isForInductionVar(iv) && "iv must be a AffineForOp");
   assert(index.getType().isa<IndexType>() && "index must be of IndexType");
   SmallVector<Operation *, 4> affineApplyOps;
   getReachableAffineApplyOps({index}, affineApplyOps);

   if (affineApplyOps.empty()) {
     // Pointer equality test because of Value pointer semantics.
     return index != iv;
   }

   if (affineApplyOps.size() > 1) {
     affineApplyOps[0]->emitRemark(
         "CompositionAffineMapsPass must have been run: there should be at most "
         "one AffineApplyOp, returning false conservatively.");
     return false;
   }

   auto composeOp = cast<AffineApplyOp>(affineApplyOps[0]);
   // We need yet another level of indirection because the `dim` index of the
   // access may not correspond to the `dim` index of composeOp.
   return !composeOp.getAffineValueMap().isFunctionOf(0, iv);
 }

 DenseSet<Value> mlir::getInvariantAccesses(Value iv, ArrayRef<Value> indices) {
   DenseSet<Value> res;
   for (unsigned idx = 0, n = indices.size(); idx < n; ++idx) {
     auto val = indices[idx];
     if (isAccessIndexInvariant(iv, val)) {
       res.insert(val);
     }
   }
   return res;
 }

 /// Given:
 ///   1. an induction variable `iv` of type AffineForOp;
 ///   2. a `memoryOp` of type const LoadOp& or const StoreOp&;
 /// determines whether `memoryOp` has a contiguous access along `iv`. Contiguous
 /// is defined as either invariant or varying only along a unique MemRef dim.
 /// Upon success, the unique MemRef dim is written in `memRefDim` (or -1 to
 /// convey the memRef access is invariant along `iv`).
 ///
 /// Prerequisites:
 ///   1. `memRefDim` ~= nullptr;
 ///   2. `iv` of the proper type;
 ///   3. the MemRef accessed by `memoryOp` has no layout map or at most an
 ///      identity layout map.
 ///
 /// Currently only supports no layoutMap or identity layoutMap in the MemRef.
 /// Returns false if the MemRef has a non-identity layoutMap or more than 1
 /// layoutMap. This is conservative.
 ///
 // TODO: check strides.
 template <typename LoadOrStoreOp>
 static bool isContiguousAccess(Value iv, LoadOrStoreOp memoryOp,
                                int *memRefDim) {
   static_assert(
       llvm::is_one_of<LoadOrStoreOp, AffineLoadOp, AffineStoreOp>::value,
       "Must be called on either LoadOp or StoreOp");
   assert(memRefDim && "memRefDim == nullptr");
   auto memRefType = memoryOp.getMemRefType();

   if (!memRefType.getLayout().isIdentity())
     return memoryOp.emitError("NYI: non-trivial layoutMap"), false;

   int uniqueVaryingIndexAlongIv = -1;
   auto accessMap = memoryOp.getAffineMap();
   SmallVector<Value, 4> mapOperands(memoryOp.getMapOperands());
   unsigned numDims = accessMap.getNumDims();
   for (unsigned i = 0, e = memRefType.getRank(); i < e; ++i) {
     // Gather map operands used result expr 'i' in 'exprOperands'.
     SmallVector<Value, 4> exprOperands;
     auto resultExpr = accessMap.getResult(i);
     resultExpr.walk([&](AffineExpr expr) {
       if (auto dimExpr = expr.dyn_cast<AffineDimExpr>())
         exprOperands.push_back(mapOperands[dimExpr.getPosition()]);
       else if (auto symExpr = expr.dyn_cast<AffineSymbolExpr>())
         exprOperands.push_back(mapOperands[numDims + symExpr.getPosition()]);
     });
     // Check access invariance of each operand in 'exprOperands'.
     for (auto exprOperand : exprOperands) {
       if (!isAccessIndexInvariant(iv, exprOperand)) {
         if (uniqueVaryingIndexAlongIv != -1) {
           // 2+ varying indices -> do not vectorize along iv.
           return false;
         }
         uniqueVaryingIndexAlongIv = i;
       }
     }
   }

   if (uniqueVaryingIndexAlongIv == -1)
     *memRefDim = -1;
   else
     *memRefDim = memRefType.getRank() - (uniqueVaryingIndexAlongIv + 1);
   return true;
 }

 template <typename LoadOrStoreOp>
 static bool isVectorElement(LoadOrStoreOp memoryOp) {
   auto memRefType = memoryOp.getMemRefType();
   return memRefType.getElementType().template isa<VectorType>();
 }

 using VectorizableOpFun = std::function<bool(AffineForOp, Operation &)>;

 static bool
 isVectorizableLoopBodyWithOpCond(AffineForOp loop,
                                  VectorizableOpFun isVectorizableOp,
                                  NestedPattern &vectorTransferMatcher) {
   auto *forOp = loop.getOperation();

   // No vectorization across conditionals for now.
   auto conditionals = matcher::If();
   SmallVector<NestedMatch, 8> conditionalsMatched;
   conditionals.match(forOp, &conditionalsMatched);
   if (!conditionalsMatched.empty()) {
     return false;
   }

   // No vectorization across unknown regions.
   auto regions = matcher::Op([](Operation &op) -> bool {
     return op.getNumRegions() != 0 && !isa<AffineIfOp, AffineForOp>(op);
   });
   SmallVector<NestedMatch, 8> regionsMatched;
   regions.match(forOp, &regionsMatched);
   if (!regionsMatched.empty()) {
     return false;
   }

   SmallVector<NestedMatch, 8> vectorTransfersMatched;
   vectorTransferMatcher.match(forOp, &vectorTransfersMatched);
   if (!vectorTransfersMatched.empty()) {
     return false;
   }

   auto loadAndStores = matcher::Op(matcher::isLoadOrStore);
   SmallVector<NestedMatch, 8> loadAndStoresMatched;
   loadAndStores.match(forOp, &loadAndStoresMatched);
   for (auto ls : loadAndStoresMatched) {
     auto *op = ls.getMatchedOperation();
     auto load = dyn_cast<AffineLoadOp>(op);
     auto store = dyn_cast<AffineStoreOp>(op);
     // Only scalar types are considered vectorizable, all load/store must be
     // vectorizable for a loop to qualify as vectorizable.
     // TODO: ponder whether we want to be more general here.
     bool vector = load ? isVectorElement(load) : isVectorElement(store);
     if (vector) {
       return false;
     }
     if (isVectorizableOp && !isVectorizableOp(loop, *op)) {
       return false;
     }
   }
   return true;
 }

 bool mlir::isVectorizableLoopBody(AffineForOp loop, int *memRefDim,
                                   NestedPattern &vectorTransferMatcher) {
   *memRefDim = -1;
   VectorizableOpFun fun([memRefDim](AffineForOp loop, Operation &op) {
     auto load = dyn_cast<AffineLoadOp>(op);
     auto store = dyn_cast<AffineStoreOp>(op);
     int thisOpMemRefDim = -1;
     bool isContiguous = load ? isContiguousAccess(loop.getInductionVar(), load,
                                                   &thisOpMemRefDim)
                              : isContiguousAccess(loop.getInductionVar(), store,
                                                   &thisOpMemRefDim);
     if (thisOpMemRefDim != -1) {
       // If memory accesses vary across different dimensions then the loop is
       // not vectorizable.
       if (*memRefDim != -1 && *memRefDim != thisOpMemRefDim)
         return false;
       *memRefDim = thisOpMemRefDim;
     }
     return isContiguous;
   });
   return isVectorizableLoopBodyWithOpCond(loop, fun, vectorTransferMatcher);
 }

 bool mlir::isVectorizableLoopBody(AffineForOp loop,
                                   NestedPattern &vectorTransferMatcher) {
   return isVectorizableLoopBodyWithOpCond(loop, nullptr, vectorTransferMatcher);
 }

 /// Checks whether SSA dominance would be violated if a for op's body
 /// operations are shifted by the specified shifts. This method checks if a
 /// 'def' and all its uses have the same shift factor.
 // TODO: extend this to check for memory-based dependence violation when we have
 // the support.
 bool mlir::isOpwiseShiftValid(AffineForOp forOp, ArrayRef<uint64_t> shifts) {
   auto *forBody = forOp.getBody();
   assert(shifts.size() == forBody->getOperations().size());

   // Work backwards over the body of the block so that the shift of a use's
   // ancestor operation in the block gets recorded before it's looked up.
   DenseMap<Operation *, uint64_t> forBodyShift;
   for (auto it : llvm::enumerate(llvm::reverse(forBody->getOperations()))) {
     auto &op = it.value();

     // Get the index of the current operation, note that we are iterating in
     // reverse so we need to fix it up.
     size_t index = shifts.size() - it.index() - 1;

     // Remember the shift of this operation.
     uint64_t shift = shifts[index];
     forBodyShift.try_emplace(&op, shift);

     // Validate the results of this operation if it were to be shifted.
     for (unsigned i = 0, e = op.getNumResults(); i < e; ++i) {
       Value result = op.getResult(i);
       for (auto *user : result.getUsers()) {
         // If an ancestor operation doesn't lie in the block of forOp,
         // there is no shift to check.
         if (auto *ancOp = forBody->findAncestorOpInBlock(*user)) {
           assert(forBodyShift.count(ancOp) > 0 && "ancestor expected in map");
           if (shift != forBodyShift[ancOp])
             return false;
         }
       }
     }
   }
   return true;
 }

 /// Returns true if `value` (transitively) depends on iteration-carried values
 /// of the given `ancestorOp`.
 static bool dependsOnCarriedVals(Value value,
                                  ArrayRef<BlockArgument> iterCarriedArgs,
                                  Operation *ancestorOp) {
   // Compute the backward slice of the value.
   SetVector<Operation *> slice;
   getBackwardSlice(value, &slice,
                    [&](Operation *op) { return !ancestorOp->isAncestor(op); });

   // Check that none of the operands of the operations in the backward slice are
   // loop iteration arguments, and neither is the value itself.
   SmallPtrSet<Value, 8> iterCarriedValSet(iterCarriedArgs.begin(),
                                           iterCarriedArgs.end());
   if (iterCarriedValSet.contains(value))
     return true;

   for (Operation *op : slice)
     for (Value operand : op->getOperands())
       if (iterCarriedValSet.contains(operand))
         return true;

   return false;
 }

 /// Utility to match a generic reduction given a list of iteration-carried
 /// arguments, `iterCarriedArgs` and the position of the potential reduction
 /// argument within the list, `redPos`. If a reduction is matched, returns the
 /// reduced value and the topologically-sorted list of combiner operations
 /// involved in the reduction. Otherwise, returns a null value.
 ///
 /// The matching algorithm relies on the following invariants, which are subject
 /// to change:
 ///  1. The first combiner operation must be a binary operation with the
 ///     iteration-carried value and the reduced value as operands.
 ///  2. The iteration-carried value and combiner operations must be side
 ///     effect-free, have single result and a single use.
 ///  3. Combiner operations must be immediately nested in the region op
 ///     performing the reduction.
 ///  4. Reduction def-use chain must end in a terminator op that yields the
 ///     next iteration/output values in the same order as the iteration-carried
 ///     values in `iterCarriedArgs`.
 ///  5. `iterCarriedArgs` must contain all the iteration-carried/output values
 ///     of the region op performing the reduction.
 ///
 /// This utility is generic enough to detect reductions involving multiple
 /// combiner operations (disabled for now) across multiple dialects, including
 /// Linalg, Affine and SCF. For the sake of genericity, it does not return
 /// specific enum values for the combiner operations since its goal is also
 /// matching reductions without pre-defined semantics in core MLIR. It's up to
 /// each client to make sense out of the list of combiner operations. It's also
 /// up to each client to check for additional invariants on the expected
 /// reductions not covered by this generic matching.
 Value mlir::matchReduction(ArrayRef<BlockArgument> iterCarriedArgs,
                            unsigned redPos,
                            SmallVectorImpl<Operation *> &combinerOps) {
   assert(redPos < iterCarriedArgs.size() && "'redPos' is out of bounds");

   BlockArgument redCarriedVal = iterCarriedArgs[redPos];
   if (!redCarriedVal.hasOneUse())
     return nullptr;

   // For now, the first combiner op must be a binary op.
   Operation *combinerOp = *redCarriedVal.getUsers().begin();
   if (combinerOp->getNumOperands() != 2)
     return nullptr;
   Value reducedVal = combinerOp->getOperand(0) == redCarriedVal
                          ? combinerOp->getOperand(1)
                          : combinerOp->getOperand(0);

   Operation *redRegionOp =
       iterCarriedArgs.front().getOwner()->getParent()->getParentOp();
   if (dependsOnCarriedVals(reducedVal, iterCarriedArgs, redRegionOp))
     return nullptr;

   // Traverse the def-use chain starting from the first combiner op until a
   // terminator is found. Gather all the combiner ops along the way in
   // topological order.
   while (!combinerOp->mightHaveTrait<OpTrait::IsTerminator>()) {
     if (!MemoryEffectOpInterface::hasNoEffect(combinerOp) ||
         combinerOp->getNumResults() != 1 || !combinerOp->hasOneUse() ||
         combinerOp->getParentOp() != redRegionOp)
       return nullptr;

     combinerOps.push_back(combinerOp);
     combinerOp = *combinerOp->getUsers().begin();
   }

   // Limit matching to single combiner op until we can properly test reductions
   // involving multiple combiners.
   if (combinerOps.size() != 1)
     return nullptr;

   // Check that the yielded value is in the same position as in
   // `iterCarriedArgs`.
   Operation *terminatorOp = combinerOp;
   if (terminatorOp->getOperand(redPos) != combinerOps.back()->getResults()[0])
     return nullptr;

   return reducedVal;
 }
	//===- LoopAnalysis.cpp - Misc loop analysis routines //-------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements miscellaneous loop analysis routines.
	//
	//===----------------------------------------------------------------------===//

	#include "mlir/Analysis/LoopAnalysis.h"

	#include "mlir/Analysis/AffineAnalysis.h"
	#include "mlir/Analysis/AffineStructures.h"
	#include "mlir/Analysis/NestedMatcher.h"
	#include "mlir/Analysis/SliceAnalysis.h"
	#include "mlir/Dialect/Affine/IR/AffineOps.h"
	#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
	#include "mlir/Support/MathExtras.h"

	#include "llvm/ADT/DenseSet.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallString.h"
	#include <type_traits>

	using namespace mlir;

	/// Returns the trip count of the loop as an affine expression if the latter is
	/// expressible as an affine expression, and nullptr otherwise. The trip count
	/// expression is simplified before returning. This method only utilizes map
	/// composition to construct lower and upper bounds before computing the trip
	/// count expressions.
	void mlir::getTripCountMapAndOperands(
	AffineForOp forOp, AffineMap *tripCountMap,
	SmallVectorImpl<Value> *tripCountOperands) {
	MLIRContext *context = forOp.getContext();
	int64_t step = forOp.getStep();
	int64_t loopSpan;
	if (forOp.hasConstantBounds()) {
	int64_t lb = forOp.getConstantLowerBound();
	int64_t ub = forOp.getConstantUpperBound();
	loopSpan = ub - lb;
	if (loopSpan < 0)
	loopSpan = 0;
	*tripCountMap = AffineMap::getConstantMap(ceilDiv(loopSpan, step), context);
	tripCountOperands->clear();
	return;
	}
	auto lbMap = forOp.getLowerBoundMap();
	auto ubMap = forOp.getUpperBoundMap();
	if (lbMap.getNumResults() != 1) {
	*tripCountMap = AffineMap();
	return;
	}

	// Difference of each upper bound expression from the single lower bound
	// expression (divided by the step) provides the expressions for the trip
	// count map.
	AffineValueMap ubValueMap(ubMap, forOp.getUpperBoundOperands());

	SmallVector<AffineExpr, 4> lbSplatExpr(ubValueMap.getNumResults(),
	lbMap.getResult(0));
	auto lbMapSplat = AffineMap::get(lbMap.getNumDims(), lbMap.getNumSymbols(),
	lbSplatExpr, context);
	AffineValueMap lbSplatValueMap(lbMapSplat, forOp.getLowerBoundOperands());

	AffineValueMap tripCountValueMap;
	AffineValueMap::difference(ubValueMap, lbSplatValueMap, &tripCountValueMap);
	for (unsigned i = 0, e = tripCountValueMap.getNumResults(); i < e; ++i)
	tripCountValueMap.setResult(i,
	tripCountValueMap.getResult(i).ceilDiv(step));

	*tripCountMap = tripCountValueMap.getAffineMap();
	tripCountOperands->assign(tripCountValueMap.getOperands().begin(),
	tripCountValueMap.getOperands().end());
	}

	/// Returns the trip count of the loop if it's a constant, None otherwise. This
	/// method uses affine expression analysis (in turn using getTripCount) and is
	/// able to determine constant trip count in non-trivial cases.
	Optional<uint64_t> mlir::getConstantTripCount(AffineForOp forOp) {
	SmallVector<Value, 4> operands;
	AffineMap map;
	getTripCountMapAndOperands(forOp, &map, &operands);

	if (!map)
	return None;

	// Take the min if all trip counts are constant.
	Optional<uint64_t> tripCount;
	for (auto resultExpr : map.getResults()) {
	if (auto constExpr = resultExpr.dyn_cast<AffineConstantExpr>()) {
	if (tripCount.hasValue())
	tripCount = std::min(tripCount.getValue(),
	static_cast<uint64_t>(constExpr.getValue()));
	else
	tripCount = constExpr.getValue();
	} else
	return None;
	}
	return tripCount;
	}

	/// Returns the greatest known integral divisor of the trip count. Affine
	/// expression analysis is used (indirectly through getTripCount), and
	/// this method is thus able to determine non-trivial divisors.
	uint64_t mlir::getLargestDivisorOfTripCount(AffineForOp forOp) {
	SmallVector<Value, 4> operands;
	AffineMap map;
	getTripCountMapAndOperands(forOp, &map, &operands);

	if (!map)
	return 1;

	// The largest divisor of the trip count is the GCD of the individual largest
	// divisors.
	assert(map.getNumResults() >= 1 && "expected one or more results");
	Optional<uint64_t> gcd;
	for (auto resultExpr : map.getResults()) {
	uint64_t thisGcd;
	if (auto constExpr = resultExpr.dyn_cast<AffineConstantExpr>()) {
	uint64_t tripCount = constExpr.getValue();
	// 0 iteration loops (greatest divisor is 2^64 - 1).
	if (tripCount == 0)
	thisGcd = std::numeric_limits<uint64_t>::max();
	else
	// The greatest divisor is the trip count.
	thisGcd = tripCount;
	} else {
	// Trip count is not a known constant; return its largest known divisor.
	thisGcd = resultExpr.getLargestKnownDivisor();
	}
	if (gcd.hasValue())
	gcd = llvm::GreatestCommonDivisor64(gcd.getValue(), thisGcd);
	else
	gcd = thisGcd;
	}
	assert(gcd.hasValue() && "value expected per above logic");
	return gcd.getValue();
	}

	/// Given an induction variable `iv` of type AffineForOp and an access `index`
	/// of type index, returns `true` if `index` is independent of `iv` and
	/// false otherwise. The determination supports composition with at most one
	/// AffineApplyOp. The 'at most one AffineApplyOp' comes from the fact that
	/// the composition of AffineApplyOp needs to be canonicalized by construction
	/// to avoid writing code that composes arbitrary numbers of AffineApplyOps
	/// everywhere. To achieve this, at the very least, the compose-affine-apply
	/// pass must have been run.
	///
	/// Prerequisites:
	/// 1. `iv` and `index` of the proper type;
	/// 2. at most one reachable AffineApplyOp from index;
	///
	/// Returns false in cases with more than one AffineApplyOp, this is
	/// conservative.
	static bool isAccessIndexInvariant(Value iv, Value index) {
	assert(isForInductionVar(iv) && "iv must be a AffineForOp");
	assert(index.getType().isa<IndexType>() && "index must be of IndexType");
	SmallVector<Operation *, 4> affineApplyOps;
	getReachableAffineApplyOps({index}, affineApplyOps);

	if (affineApplyOps.empty()) {
	// Pointer equality test because of Value pointer semantics.
	return index != iv;
	}

	if (affineApplyOps.size() > 1) {
	affineApplyOps[0]->emitRemark(
	"CompositionAffineMapsPass must have been run: there should be at most "
	"one AffineApplyOp, returning false conservatively.");
	return false;
	}

	auto composeOp = cast<AffineApplyOp>(affineApplyOps[0]);
	// We need yet another level of indirection because the `dim` index of the
	// access may not correspond to the `dim` index of composeOp.
	return !composeOp.getAffineValueMap().isFunctionOf(0, iv);
	}

	DenseSet<Value> mlir::getInvariantAccesses(Value iv, ArrayRef<Value> indices) {
	DenseSet<Value> res;
	for (unsigned idx = 0, n = indices.size(); idx < n; ++idx) {
	auto val = indices[idx];
	if (isAccessIndexInvariant(iv, val)) {
	res.insert(val);
	}
	}
	return res;
	}

	/// Given:
	/// 1. an induction variable `iv` of type AffineForOp;
	/// 2. a `memoryOp` of type const LoadOp& or const StoreOp&;
	/// determines whether `memoryOp` has a contiguous access along `iv`. Contiguous
	/// is defined as either invariant or varying only along a unique MemRef dim.
	/// Upon success, the unique MemRef dim is written in `memRefDim` (or -1 to
	/// convey the memRef access is invariant along `iv`).
	///
	/// Prerequisites:
	/// 1. `memRefDim` ~= nullptr;
	/// 2. `iv` of the proper type;
	/// 3. the MemRef accessed by `memoryOp` has no layout map or at most an
	/// identity layout map.
	///
	/// Currently only supports no layoutMap or identity layoutMap in the MemRef.
	/// Returns false if the MemRef has a non-identity layoutMap or more than 1
	/// layoutMap. This is conservative.
	///
	// TODO: check strides.
	template <typename LoadOrStoreOp>
	static bool isContiguousAccess(Value iv, LoadOrStoreOp memoryOp,
	int *memRefDim) {
	static_assert(
	llvm::is_one_of<LoadOrStoreOp, AffineLoadOp, AffineStoreOp>::value,
	"Must be called on either LoadOp or StoreOp");
	assert(memRefDim && "memRefDim == nullptr");
	auto memRefType = memoryOp.getMemRefType();

	if (!memRefType.getLayout().isIdentity())
	return memoryOp.emitError("NYI: non-trivial layoutMap"), false;

	int uniqueVaryingIndexAlongIv = -1;
	auto accessMap = memoryOp.getAffineMap();
	SmallVector<Value, 4> mapOperands(memoryOp.getMapOperands());
	unsigned numDims = accessMap.getNumDims();
	for (unsigned i = 0, e = memRefType.getRank(); i < e; ++i) {
	// Gather map operands used result expr 'i' in 'exprOperands'.
	SmallVector<Value, 4> exprOperands;
	auto resultExpr = accessMap.getResult(i);
	resultExpr.walk([&](AffineExpr expr) {
	if (auto dimExpr = expr.dyn_cast<AffineDimExpr>())
	exprOperands.push_back(mapOperands[dimExpr.getPosition()]);
	else if (auto symExpr = expr.dyn_cast<AffineSymbolExpr>())
	exprOperands.push_back(mapOperands[numDims + symExpr.getPosition()]);
	});
	// Check access invariance of each operand in 'exprOperands'.
	for (auto exprOperand : exprOperands) {
	if (!isAccessIndexInvariant(iv, exprOperand)) {
	if (uniqueVaryingIndexAlongIv != -1) {
	// 2+ varying indices -> do not vectorize along iv.
	return false;
	}
	uniqueVaryingIndexAlongIv = i;
	}
	}
	}

	if (uniqueVaryingIndexAlongIv == -1)
	*memRefDim = -1;
	else
	*memRefDim = memRefType.getRank() - (uniqueVaryingIndexAlongIv + 1);
	return true;
	}

	template <typename LoadOrStoreOp>
	static bool isVectorElement(LoadOrStoreOp memoryOp) {
	auto memRefType = memoryOp.getMemRefType();
	return memRefType.getElementType().template isa<VectorType>();
	}

	using VectorizableOpFun = std::function<bool(AffineForOp, Operation &)>;

	static bool
	isVectorizableLoopBodyWithOpCond(AffineForOp loop,
	VectorizableOpFun isVectorizableOp,
	NestedPattern &vectorTransferMatcher) {
	auto *forOp = loop.getOperation();

	// No vectorization across conditionals for now.
	auto conditionals = matcher::If();
	SmallVector<NestedMatch, 8> conditionalsMatched;
	conditionals.match(forOp, &conditionalsMatched);
	if (!conditionalsMatched.empty()) {
	return false;
	}

	// No vectorization across unknown regions.
	auto regions = matcher::Op([](Operation &op) -> bool {
	return op.getNumRegions() != 0 && !isa<AffineIfOp, AffineForOp>(op);
	});
	SmallVector<NestedMatch, 8> regionsMatched;
	regions.match(forOp, &regionsMatched);
	if (!regionsMatched.empty()) {
	return false;
	}

	SmallVector<NestedMatch, 8> vectorTransfersMatched;
	vectorTransferMatcher.match(forOp, &vectorTransfersMatched);
	if (!vectorTransfersMatched.empty()) {
	return false;
	}

	auto loadAndStores = matcher::Op(matcher::isLoadOrStore);
	SmallVector<NestedMatch, 8> loadAndStoresMatched;
	loadAndStores.match(forOp, &loadAndStoresMatched);
	for (auto ls : loadAndStoresMatched) {
	auto *op = ls.getMatchedOperation();
	auto load = dyn_cast<AffineLoadOp>(op);
	auto store = dyn_cast<AffineStoreOp>(op);
	// Only scalar types are considered vectorizable, all load/store must be
	// vectorizable for a loop to qualify as vectorizable.
	// TODO: ponder whether we want to be more general here.
	bool vector = load ? isVectorElement(load) : isVectorElement(store);
	if (vector) {
	return false;
	}
	if (isVectorizableOp && !isVectorizableOp(loop, *op)) {
	return false;
	}
	}
	return true;
	}

	bool mlir::isVectorizableLoopBody(AffineForOp loop, int *memRefDim,
	NestedPattern &vectorTransferMatcher) {
	*memRefDim = -1;
	VectorizableOpFun fun([memRefDim](AffineForOp loop, Operation &op) {
	auto load = dyn_cast<AffineLoadOp>(op);
	auto store = dyn_cast<AffineStoreOp>(op);
	int thisOpMemRefDim = -1;
	bool isContiguous = load ? isContiguousAccess(loop.getInductionVar(), load,
	&thisOpMemRefDim)
	: isContiguousAccess(loop.getInductionVar(), store,
	&thisOpMemRefDim);
	if (thisOpMemRefDim != -1) {
	// If memory accesses vary across different dimensions then the loop is
	// not vectorizable.
	if (memRefDim != -1 && memRefDim != thisOpMemRefDim)
	return false;
	*memRefDim = thisOpMemRefDim;
	}
	return isContiguous;
	});
	return isVectorizableLoopBodyWithOpCond(loop, fun, vectorTransferMatcher);
	}

	bool mlir::isVectorizableLoopBody(AffineForOp loop,
	NestedPattern &vectorTransferMatcher) {
	return isVectorizableLoopBodyWithOpCond(loop, nullptr, vectorTransferMatcher);
	}

	/// Checks whether SSA dominance would be violated if a for op's body
	/// operations are shifted by the specified shifts. This method checks if a
	/// 'def' and all its uses have the same shift factor.
	// TODO: extend this to check for memory-based dependence violation when we have
	// the support.
	bool mlir::isOpwiseShiftValid(AffineForOp forOp, ArrayRef<uint64_t> shifts) {
	auto *forBody = forOp.getBody();
	assert(shifts.size() == forBody->getOperations().size());

	// Work backwards over the body of the block so that the shift of a use's
	// ancestor operation in the block gets recorded before it's looked up.
	DenseMap<Operation *, uint64_t> forBodyShift;
	for (auto it : llvm::enumerate(llvm::reverse(forBody->getOperations()))) {
	auto &op = it.value();

	// Get the index of the current operation, note that we are iterating in
	// reverse so we need to fix it up.
	size_t index = shifts.size() - it.index() - 1;

	// Remember the shift of this operation.
	uint64_t shift = shifts[index];
	forBodyShift.try_emplace(&op, shift);

	// Validate the results of this operation if it were to be shifted.
	for (unsigned i = 0, e = op.getNumResults(); i < e; ++i) {
	Value result = op.getResult(i);
	for (auto *user : result.getUsers()) {
	// If an ancestor operation doesn't lie in the block of forOp,
	// there is no shift to check.
	if (auto ancOp = forBody->findAncestorOpInBlock(user)) {
	assert(forBodyShift.count(ancOp) > 0 && "ancestor expected in map");
	if (shift != forBodyShift[ancOp])
	return false;
	}
	}
	}
	}
	return true;
	}

	/// Returns true if `value` (transitively) depends on iteration-carried values
	/// of the given `ancestorOp`.
	static bool dependsOnCarriedVals(Value value,
	ArrayRef<BlockArgument> iterCarriedArgs,
	Operation *ancestorOp) {
	// Compute the backward slice of the value.
	SetVector<Operation *> slice;
	getBackwardSlice(value, &slice,
	[&](Operation *op) { return !ancestorOp->isAncestor(op); });

	// Check that none of the operands of the operations in the backward slice are
	// loop iteration arguments, and neither is the value itself.
	SmallPtrSet<Value, 8> iterCarriedValSet(iterCarriedArgs.begin(),
	iterCarriedArgs.end());
	if (iterCarriedValSet.contains(value))
	return true;

	for (Operation *op : slice)
	for (Value operand : op->getOperands())
	if (iterCarriedValSet.contains(operand))
	return true;

	return false;
	}

	/// Utility to match a generic reduction given a list of iteration-carried
	/// arguments, `iterCarriedArgs` and the position of the potential reduction
	/// argument within the list, `redPos`. If a reduction is matched, returns the
	/// reduced value and the topologically-sorted list of combiner operations
	/// involved in the reduction. Otherwise, returns a null value.
	///
	/// The matching algorithm relies on the following invariants, which are subject
	/// to change:
	/// 1. The first combiner operation must be a binary operation with the
	/// iteration-carried value and the reduced value as operands.
	/// 2. The iteration-carried value and combiner operations must be side
	/// effect-free, have single result and a single use.
	/// 3. Combiner operations must be immediately nested in the region op
	/// performing the reduction.
	/// 4. Reduction def-use chain must end in a terminator op that yields the
	/// next iteration/output values in the same order as the iteration-carried
	/// values in `iterCarriedArgs`.
	/// 5. `iterCarriedArgs` must contain all the iteration-carried/output values
	/// of the region op performing the reduction.
	///
	/// This utility is generic enough to detect reductions involving multiple
	/// combiner operations (disabled for now) across multiple dialects, including
	/// Linalg, Affine and SCF. For the sake of genericity, it does not return
	/// specific enum values for the combiner operations since its goal is also
	/// matching reductions without pre-defined semantics in core MLIR. It's up to
	/// each client to make sense out of the list of combiner operations. It's also
	/// up to each client to check for additional invariants on the expected
	/// reductions not covered by this generic matching.
	Value mlir::matchReduction(ArrayRef<BlockArgument> iterCarriedArgs,
	unsigned redPos,
	SmallVectorImpl<Operation *> &combinerOps) {
	assert(redPos < iterCarriedArgs.size() && "'redPos' is out of bounds");

	BlockArgument redCarriedVal = iterCarriedArgs[redPos];
	if (!redCarriedVal.hasOneUse())
	return nullptr;

	// For now, the first combiner op must be a binary op.
	Operation combinerOp = redCarriedVal.getUsers().begin();
	if (combinerOp->getNumOperands() != 2)
	return nullptr;
	Value reducedVal = combinerOp->getOperand(0) == redCarriedVal
	? combinerOp->getOperand(1)
	: combinerOp->getOperand(0);

	Operation *redRegionOp =
	iterCarriedArgs.front().getOwner()->getParent()->getParentOp();
	if (dependsOnCarriedVals(reducedVal, iterCarriedArgs, redRegionOp))
	return nullptr;

	// Traverse the def-use chain starting from the first combiner op until a
	// terminator is found. Gather all the combiner ops along the way in
	// topological order.
	while (!combinerOp->mightHaveTrait<OpTrait::IsTerminator>()) {
	if (!MemoryEffectOpInterface::hasNoEffect(combinerOp) \|\|
	combinerOp->getNumResults() != 1 \|\| !combinerOp->hasOneUse() \|\|
	combinerOp->getParentOp() != redRegionOp)
	return nullptr;

	combinerOps.push_back(combinerOp);
	combinerOp = *combinerOp->getUsers().begin();
	}

	// Limit matching to single combiner op until we can properly test reductions
	// involving multiple combiners.
	if (combinerOps.size() != 1)
	return nullptr;

	// Check that the yielded value is in the same position as in
	// `iterCarriedArgs`.
	Operation *terminatorOp = combinerOp;
	if (terminatorOp->getOperand(redPos) != combinerOps.back()->getResults()[0])
	return nullptr;

	return reducedVal;
	}