| //===- Utils.cpp ---- Utilities for affine dialect transformation ---------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements miscellaneous transformation utilities for the Affine |
| // dialect. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "mlir/Dialect/Affine/Utils.h" |
| |
| #include "mlir/Dialect/Affine/Analysis/Utils.h" |
| #include "mlir/Dialect/Affine/IR/AffineOps.h" |
| #include "mlir/Dialect/Affine/IR/AffineValueMap.h" |
| #include "mlir/Dialect/Affine/LoopUtils.h" |
| #include "mlir/Dialect/Arith/Utils/Utils.h" |
| #include "mlir/Dialect/Func/IR/FuncOps.h" |
| #include "mlir/Dialect/MemRef/IR/MemRef.h" |
| #include "mlir/Dialect/Utils/IndexingUtils.h" |
| #include "mlir/IR/AffineExprVisitor.h" |
| #include "mlir/IR/Dominance.h" |
| #include "mlir/IR/IRMapping.h" |
| #include "mlir/IR/ImplicitLocOpBuilder.h" |
| #include "mlir/IR/IntegerSet.h" |
| #include "mlir/Transforms/GreedyPatternRewriteDriver.h" |
| #include "llvm/Support/LogicalResult.h" |
| #include <optional> |
| |
| #define DEBUG_TYPE "affine-utils" |
| |
| using namespace mlir; |
| using namespace affine; |
| using namespace presburger; |
| |
| namespace { |
| /// Visit affine expressions recursively and build the sequence of operations |
| /// that corresponds to them. Visitation functions return a Value of the |
| /// expression subtree they visited or `nullptr` on error. |
| class AffineApplyExpander |
| : public AffineExprVisitor<AffineApplyExpander, Value> { |
| public: |
|   /// This internal class expects arguments to be non-null; checks must be |
|   /// performed at the call site. |
| AffineApplyExpander(OpBuilder &builder, ValueRange dimValues, |
| ValueRange symbolValues, Location loc) |
| : builder(builder), dimValues(dimValues), symbolValues(symbolValues), |
| loc(loc) {} |
| |
| template <typename OpTy> |
| Value buildBinaryExpr(AffineBinaryOpExpr expr, |
| arith::IntegerOverflowFlags overflowFlags = |
| arith::IntegerOverflowFlags::none) { |
| auto lhs = visit(expr.getLHS()); |
| auto rhs = visit(expr.getRHS()); |
| if (!lhs || !rhs) |
| return nullptr; |
| auto op = builder.create<OpTy>(loc, lhs, rhs, overflowFlags); |
| return op.getResult(); |
| } |
| |
| Value visitAddExpr(AffineBinaryOpExpr expr) { |
| return buildBinaryExpr<arith::AddIOp>(expr); |
| } |
| |
| Value visitMulExpr(AffineBinaryOpExpr expr) { |
| return buildBinaryExpr<arith::MulIOp>(expr, |
| arith::IntegerOverflowFlags::nsw); |
| } |
| |
|   /// Euclidean modulo operation: a non-positive RHS is not allowed. |
|   /// The remainder of the Euclidean integer division is always non-negative. |
| /// |
| /// Implemented as |
| /// |
| /// a mod b = |
| /// let remainder = srem a, b; |
| /// negative = a < 0 in |
| /// select negative, remainder + b, remainder. |
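|   /// |
|   /// For example (illustrative): for a = -7 and b = 3, srem yields -1, which |
|   /// is negative, so the result is -1 + 3 = 2 (indeed, -7 = (-3) * 3 + 2). |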
| Value visitModExpr(AffineBinaryOpExpr expr) { |
| if (auto rhsConst = dyn_cast<AffineConstantExpr>(expr.getRHS())) { |
| if (rhsConst.getValue() <= 0) { |
| emitError(loc, "modulo by non-positive value is not supported"); |
| return nullptr; |
| } |
| } |
| |
| auto lhs = visit(expr.getLHS()); |
| auto rhs = visit(expr.getRHS()); |
| assert(lhs && rhs && "unexpected affine expr lowering failure"); |
| |
| Value remainder = builder.create<arith::RemSIOp>(loc, lhs, rhs); |
| Value zeroCst = builder.create<arith::ConstantIndexOp>(loc, 0); |
| Value isRemainderNegative = builder.create<arith::CmpIOp>( |
| loc, arith::CmpIPredicate::slt, remainder, zeroCst); |
| Value correctedRemainder = |
| builder.create<arith::AddIOp>(loc, remainder, rhs); |
| Value result = builder.create<arith::SelectOp>( |
| loc, isRemainderNegative, correctedRemainder, remainder); |
| return result; |
| } |
| |
| /// Floor division operation (rounds towards negative infinity). |
| /// |
| /// For positive divisors, it can be implemented without branching and with a |
| /// single division operation as |
| /// |
| /// a floordiv b = |
| /// let negative = a < 0 in |
| /// let absolute = negative ? -a - 1 : a in |
| /// let quotient = absolute / b in |
| /// negative ? -quotient - 1 : quotient |
| /// |
| /// Note: this lowering does not use arith.floordivsi because the lowering of |
| /// that to arith.divsi (see populateCeilFloorDivExpandOpsPatterns) generates |
| /// not one but two arith.divsi. That could be changed to one divsi, but one |
| /// way or another, going through arith.floordivsi will result in more complex |
| /// IR because arith.floordivsi is more general than affine floordiv in that |
| /// it supports negative RHS. |
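|   /// |
|   /// For example (illustrative): for a = -7 and b = 2, negative is true, the |
|   /// dividend is -a - 1 = 6, the quotient is 6 / 2 = 3, and the result is |
|   /// -3 - 1 = -4 (indeed, -7 floordiv 2 = -4). |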
| Value visitFloorDivExpr(AffineBinaryOpExpr expr) { |
| if (auto rhsConst = dyn_cast<AffineConstantExpr>(expr.getRHS())) { |
| if (rhsConst.getValue() <= 0) { |
| emitError(loc, "division by non-positive value is not supported"); |
| return nullptr; |
| } |
| } |
| auto lhs = visit(expr.getLHS()); |
| auto rhs = visit(expr.getRHS()); |
| assert(lhs && rhs && "unexpected affine expr lowering failure"); |
| |
| Value zeroCst = builder.create<arith::ConstantIndexOp>(loc, 0); |
| Value noneCst = builder.create<arith::ConstantIndexOp>(loc, -1); |
| Value negative = builder.create<arith::CmpIOp>( |
| loc, arith::CmpIPredicate::slt, lhs, zeroCst); |
| Value negatedDecremented = builder.create<arith::SubIOp>(loc, noneCst, lhs); |
| Value dividend = |
| builder.create<arith::SelectOp>(loc, negative, negatedDecremented, lhs); |
| Value quotient = builder.create<arith::DivSIOp>(loc, dividend, rhs); |
| Value correctedQuotient = |
| builder.create<arith::SubIOp>(loc, noneCst, quotient); |
| Value result = builder.create<arith::SelectOp>(loc, negative, |
| correctedQuotient, quotient); |
| return result; |
| } |
| |
| /// Ceiling division operation (rounds towards positive infinity). |
| /// |
| /// For positive divisors, it can be implemented without branching and with a |
| /// single division operation as |
| /// |
| /// a ceildiv b = |
| /// let negative = a <= 0 in |
| /// let absolute = negative ? -a : a - 1 in |
| /// let quotient = absolute / b in |
| /// negative ? -quotient : quotient + 1 |
| /// |
| /// Note: not using arith.ceildivsi for the same reason as explained in the |
| /// visitFloorDivExpr comment. |
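|   /// |
|   /// For example (illustrative): for a = 7 and b = 2, nonPositive is false, |
|   /// the dividend is a - 1 = 6, the quotient is 6 / 2 = 3, and the result is |
|   /// 3 + 1 = 4 (indeed, 7 ceildiv 2 = 4). |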
| Value visitCeilDivExpr(AffineBinaryOpExpr expr) { |
| if (auto rhsConst = dyn_cast<AffineConstantExpr>(expr.getRHS())) { |
| if (rhsConst.getValue() <= 0) { |
| emitError(loc, "division by non-positive value is not supported"); |
| return nullptr; |
| } |
| } |
| auto lhs = visit(expr.getLHS()); |
| auto rhs = visit(expr.getRHS()); |
| assert(lhs && rhs && "unexpected affine expr lowering failure"); |
| |
| Value zeroCst = builder.create<arith::ConstantIndexOp>(loc, 0); |
| Value oneCst = builder.create<arith::ConstantIndexOp>(loc, 1); |
| Value nonPositive = builder.create<arith::CmpIOp>( |
| loc, arith::CmpIPredicate::sle, lhs, zeroCst); |
| Value negated = builder.create<arith::SubIOp>(loc, zeroCst, lhs); |
| Value decremented = builder.create<arith::SubIOp>(loc, lhs, oneCst); |
| Value dividend = |
| builder.create<arith::SelectOp>(loc, nonPositive, negated, decremented); |
| Value quotient = builder.create<arith::DivSIOp>(loc, dividend, rhs); |
| Value negatedQuotient = |
| builder.create<arith::SubIOp>(loc, zeroCst, quotient); |
| Value incrementedQuotient = |
| builder.create<arith::AddIOp>(loc, quotient, oneCst); |
| Value result = builder.create<arith::SelectOp>( |
| loc, nonPositive, negatedQuotient, incrementedQuotient); |
| return result; |
| } |
| |
| Value visitConstantExpr(AffineConstantExpr expr) { |
| auto op = builder.create<arith::ConstantIndexOp>(loc, expr.getValue()); |
| return op.getResult(); |
| } |
| |
| Value visitDimExpr(AffineDimExpr expr) { |
| assert(expr.getPosition() < dimValues.size() && |
| "affine dim position out of range"); |
| return dimValues[expr.getPosition()]; |
| } |
| |
| Value visitSymbolExpr(AffineSymbolExpr expr) { |
| assert(expr.getPosition() < symbolValues.size() && |
| "symbol dim position out of range"); |
| return symbolValues[expr.getPosition()]; |
| } |
| |
| private: |
| OpBuilder &builder; |
| ValueRange dimValues; |
| ValueRange symbolValues; |
| |
| Location loc; |
| }; |
| } // namespace |
| |
| /// Create a sequence of operations that implement the `expr` applied to the |
| /// given dimension and symbol values. |
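| /// |
| /// For example (illustrative): expanding d0 + s0 * 2 with dimValues = {%i} |
| /// and symbolValues = {%n} emits an arith.constant for 2, an arith.muli |
| /// computing %n * 2, and an arith.addi producing %i + %n * 2. |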
| mlir::Value mlir::affine::expandAffineExpr(OpBuilder &builder, Location loc, |
| AffineExpr expr, |
| ValueRange dimValues, |
| ValueRange symbolValues) { |
| return AffineApplyExpander(builder, dimValues, symbolValues, loc).visit(expr); |
| } |
| |
| /// Create a sequence of operations that implement the `affineMap` applied to |
| /// the given `operands` (as if it were an AffineApplyOp). |
| std::optional<SmallVector<Value, 8>> |
| mlir::affine::expandAffineMap(OpBuilder &builder, Location loc, |
| AffineMap affineMap, ValueRange operands) { |
| auto numDims = affineMap.getNumDims(); |
| auto expanded = llvm::to_vector<8>( |
| llvm::map_range(affineMap.getResults(), |
| [numDims, &builder, loc, operands](AffineExpr expr) { |
| return expandAffineExpr(builder, loc, expr, |
| operands.take_front(numDims), |
| operands.drop_front(numDims)); |
| })); |
| if (llvm::all_of(expanded, [](Value v) { return v; })) |
| return expanded; |
| return std::nullopt; |
| } |
| |
| /// Promotes the `then` or the `else` block of `ifOp` (depending on whether |
| /// `elseBlock` is false or true) into `ifOp`'s containing block, and discards |
| /// the rest of the op. |
| static void promoteIfBlock(AffineIfOp ifOp, bool elseBlock) { |
| if (elseBlock) |
| assert(ifOp.hasElse() && "else block expected"); |
| |
| Block *destBlock = ifOp->getBlock(); |
| Block *srcBlock = elseBlock ? ifOp.getElseBlock() : ifOp.getThenBlock(); |
| destBlock->getOperations().splice( |
| Block::iterator(ifOp), srcBlock->getOperations(), srcBlock->begin(), |
| std::prev(srcBlock->end())); |
| ifOp.erase(); |
| } |
| |
| /// Returns the outermost affine.for/parallel op that the `ifOp` is invariant |
| /// on. The `ifOp` could be hoisted and placed right before such an operation. |
| /// This method assumes that the ifOp has been canonicalized (to be correct and |
| /// effective). |
| static Operation *getOutermostInvariantForOp(AffineIfOp ifOp) { |
|   // Walk up the parents past all for ops that this conditional is invariant on. |
| auto ifOperands = ifOp.getOperands(); |
| Operation *res = ifOp; |
| while (!res->getParentOp()->hasTrait<OpTrait::IsIsolatedFromAbove>()) { |
| auto *parentOp = res->getParentOp(); |
| if (auto forOp = dyn_cast<AffineForOp>(parentOp)) { |
| if (llvm::is_contained(ifOperands, forOp.getInductionVar())) |
| break; |
| } else if (auto parallelOp = dyn_cast<AffineParallelOp>(parentOp)) { |
| if (llvm::any_of(parallelOp.getIVs(), [&](Value iv) { |
| return llvm::is_contained(ifOperands, iv); |
| })) |
| break; |
| } else if (!isa<AffineIfOp>(parentOp)) { |
| // Won't walk up past anything other than affine.for/if ops. |
| break; |
| } |
| // You can always hoist up past any affine.if ops. |
| res = parentOp; |
| } |
| return res; |
| } |
| |
| /// A helper for the mechanics of mlir::affine::hoistAffineIfOp. Hoists `ifOp` |
| /// just over `hoistOverOp`. Returns the new hoisted op if any hoisting |
| /// happened, otherwise the same `ifOp`. |
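| /// |
| /// For example (illustrative, with hypothetical set/loop names): hoisting over |
| /// the loop in |
| ///   affine.for %i ... { affine.if #set(%n) { S1 } else { S2 } } |
| /// produces |
| ///   affine.if #set(%n) { affine.for %i ... { S1 } } |
| ///   else { affine.for %i ... { S2 } } |
| /// when the condition is invariant on %i. |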
| static AffineIfOp hoistAffineIfOp(AffineIfOp ifOp, Operation *hoistOverOp) { |
| // No hoisting to do. |
| if (hoistOverOp == ifOp) |
| return ifOp; |
| |
| // Create the hoisted 'if' first. Then, clone the op we are hoisting over for |
| // the else block. Then drop the else block of the original 'if' in the 'then' |
| // branch while promoting its then block, and analogously drop the 'then' |
| // block of the original 'if' from the 'else' branch while promoting its else |
| // block. |
| IRMapping operandMap; |
| OpBuilder b(hoistOverOp); |
| auto hoistedIfOp = b.create<AffineIfOp>(ifOp.getLoc(), ifOp.getIntegerSet(), |
| ifOp.getOperands(), |
| /*elseBlock=*/true); |
| |
| // Create a clone of hoistOverOp to use for the else branch of the hoisted |
| // conditional. The else block may get optimized away if empty. |
| Operation *hoistOverOpClone = nullptr; |
| // We use this unique name to identify/find `ifOp`'s clone in the else |
| // version. |
| StringAttr idForIfOp = b.getStringAttr("__mlir_if_hoisting"); |
| operandMap.clear(); |
| b.setInsertionPointAfter(hoistOverOp); |
| // We'll set an attribute to identify this op in a clone of this sub-tree. |
| ifOp->setAttr(idForIfOp, b.getBoolAttr(true)); |
| hoistOverOpClone = b.clone(*hoistOverOp, operandMap); |
| |
| // Promote the 'then' block of the original affine.if in the then version. |
| promoteIfBlock(ifOp, /*elseBlock=*/false); |
| |
| // Move the then version to the hoisted if op's 'then' block. |
| auto *thenBlock = hoistedIfOp.getThenBlock(); |
| thenBlock->getOperations().splice(thenBlock->begin(), |
| hoistOverOp->getBlock()->getOperations(), |
| Block::iterator(hoistOverOp)); |
| |
| // Find the clone of the original affine.if op in the else version. |
| AffineIfOp ifCloneInElse; |
| hoistOverOpClone->walk([&](AffineIfOp ifClone) { |
| if (!ifClone->getAttr(idForIfOp)) |
| return WalkResult::advance(); |
| ifCloneInElse = ifClone; |
| return WalkResult::interrupt(); |
| }); |
| assert(ifCloneInElse && "if op clone should exist"); |
| // For the else block, promote the else block of the original 'if' if it had |
| // one; otherwise, the op itself is to be erased. |
| if (!ifCloneInElse.hasElse()) |
| ifCloneInElse.erase(); |
| else |
| promoteIfBlock(ifCloneInElse, /*elseBlock=*/true); |
| |
| // Move the else version into the else block of the hoisted if op. |
| auto *elseBlock = hoistedIfOp.getElseBlock(); |
| elseBlock->getOperations().splice( |
| elseBlock->begin(), hoistOverOpClone->getBlock()->getOperations(), |
| Block::iterator(hoistOverOpClone)); |
| |
| return hoistedIfOp; |
| } |
| |
| LogicalResult |
| mlir::affine::affineParallelize(AffineForOp forOp, |
| ArrayRef<LoopReduction> parallelReductions, |
| AffineParallelOp *resOp) { |
| // Fail early if there are iter arguments that are not reductions. |
| unsigned numReductions = parallelReductions.size(); |
| if (numReductions != forOp.getNumIterOperands()) |
| return failure(); |
| |
| Location loc = forOp.getLoc(); |
| OpBuilder outsideBuilder(forOp); |
| AffineMap lowerBoundMap = forOp.getLowerBoundMap(); |
| ValueRange lowerBoundOperands = forOp.getLowerBoundOperands(); |
| AffineMap upperBoundMap = forOp.getUpperBoundMap(); |
| ValueRange upperBoundOperands = forOp.getUpperBoundOperands(); |
| |
|   // Create an empty 1-D affine.parallel op. |
| auto reducedValues = llvm::to_vector<4>(llvm::map_range( |
| parallelReductions, [](const LoopReduction &red) { return red.value; })); |
| auto reductionKinds = llvm::to_vector<4>(llvm::map_range( |
| parallelReductions, [](const LoopReduction &red) { return red.kind; })); |
| AffineParallelOp newPloop = outsideBuilder.create<AffineParallelOp>( |
| loc, ValueRange(reducedValues).getTypes(), reductionKinds, |
| llvm::ArrayRef(lowerBoundMap), lowerBoundOperands, |
| llvm::ArrayRef(upperBoundMap), upperBoundOperands, |
| llvm::ArrayRef(forOp.getStepAsInt())); |
| // Steal the body of the old affine for op. |
| newPloop.getRegion().takeBody(forOp.getRegion()); |
| Operation *yieldOp = &newPloop.getBody()->back(); |
| |
| // Handle the initial values of reductions because the parallel loop always |
| // starts from the neutral value. |
| SmallVector<Value> newResults; |
| newResults.reserve(numReductions); |
| for (unsigned i = 0; i < numReductions; ++i) { |
| Value init = forOp.getInits()[i]; |
| // This works because we are only handling single-op reductions at the |
| // moment. A switch on reduction kind or a mechanism to collect operations |
| // participating in the reduction will be necessary for multi-op reductions. |
| Operation *reductionOp = yieldOp->getOperand(i).getDefiningOp(); |
| assert(reductionOp && "yielded value is expected to be produced by an op"); |
| outsideBuilder.getInsertionBlock()->getOperations().splice( |
| outsideBuilder.getInsertionPoint(), newPloop.getBody()->getOperations(), |
| reductionOp); |
| reductionOp->setOperands({init, newPloop->getResult(i)}); |
| forOp->getResult(i).replaceAllUsesWith(reductionOp->getResult(0)); |
| } |
| |
| // Update the loop terminator to yield reduced values bypassing the reduction |
| // operation itself (now moved outside of the loop) and erase the block |
| // arguments that correspond to reductions. Note that the loop always has one |
| // "main" induction variable whenc coming from a non-parallel for. |
| unsigned numIVs = 1; |
| yieldOp->setOperands(reducedValues); |
| newPloop.getBody()->eraseArguments(numIVs, numReductions); |
| |
| forOp.erase(); |
| if (resOp) |
| *resOp = newPloop; |
| return success(); |
| } |
| |
| // Returns success if any hoisting happened. |
| LogicalResult mlir::affine::hoistAffineIfOp(AffineIfOp ifOp, bool *folded) { |
| // Bail out early if the ifOp returns a result. TODO: Consider how to |
| // properly support this case. |
| if (ifOp.getNumResults() != 0) |
| return failure(); |
| |
| // Apply canonicalization patterns and folding - this is necessary for the |
| // hoisting check to be correct (operands should be composed), and to be more |
| // effective (no unused operands). Since the pattern rewriter's folding is |
| // entangled with application of patterns, we may fold/end up erasing the op, |
| // in which case we return with `folded` being set. |
| RewritePatternSet patterns(ifOp.getContext()); |
| AffineIfOp::getCanonicalizationPatterns(patterns, ifOp.getContext()); |
| FrozenRewritePatternSet frozenPatterns(std::move(patterns)); |
| GreedyRewriteConfig config; |
| config.strictMode = GreedyRewriteStrictness::ExistingOps; |
| bool erased; |
| (void)applyOpPatternsGreedily(ifOp.getOperation(), frozenPatterns, config, |
| /*changed=*/nullptr, &erased); |
| if (erased) { |
| if (folded) |
| *folded = true; |
| return failure(); |
| } |
| if (folded) |
| *folded = false; |
| |
| // The folding above should have ensured this. |
| assert(llvm::all_of(ifOp.getOperands(), |
| [](Value v) { |
| return isTopLevelValue(v) || isAffineInductionVar(v); |
| }) && |
| "operands not composed"); |
| |
|   // We are going to hoist as high as possible. |
| // TODO: this could be customized in the future. |
| auto *hoistOverOp = getOutermostInvariantForOp(ifOp); |
| |
| AffineIfOp hoistedIfOp = ::hoistAffineIfOp(ifOp, hoistOverOp); |
| // Nothing to hoist over. |
| if (hoistedIfOp == ifOp) |
| return failure(); |
| |
| // Canonicalize to remove dead else blocks (happens whenever an 'if' moves up |
| // a sequence of affine.fors that are all perfectly nested). |
| (void)applyPatternsGreedily( |
| hoistedIfOp->getParentWithTrait<OpTrait::IsIsolatedFromAbove>(), |
| frozenPatterns); |
| |
| return success(); |
| } |
| |
| // Returns the expression obtained from `e` by replacing `dim` with `min` |
| // along positive paths and with `max` along negative paths; the path flips |
| // sign under binary sub-expressions with a negative constant operand. |
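| // |
| // For example (illustrative): substWithMin(d0 - d1, d1, 0, 10, true) yields |
| // d0 - 10, since the -1 coefficient on d1 flips the path and d1 is replaced |
| // by its maximum, giving a lower bound of d0 - d1 for d1 in [0, 10]. |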
| AffineExpr mlir::affine::substWithMin(AffineExpr e, AffineExpr dim, |
| AffineExpr min, AffineExpr max, |
| bool positivePath) { |
| if (e == dim) |
| return positivePath ? min : max; |
| if (auto bin = dyn_cast<AffineBinaryOpExpr>(e)) { |
| AffineExpr lhs = bin.getLHS(); |
| AffineExpr rhs = bin.getRHS(); |
| if (bin.getKind() == mlir::AffineExprKind::Add) |
| return substWithMin(lhs, dim, min, max, positivePath) + |
| substWithMin(rhs, dim, min, max, positivePath); |
| |
| auto c1 = dyn_cast<AffineConstantExpr>(bin.getLHS()); |
| auto c2 = dyn_cast<AffineConstantExpr>(bin.getRHS()); |
| if (c1 && c1.getValue() < 0) |
| return getAffineBinaryOpExpr( |
| bin.getKind(), c1, substWithMin(rhs, dim, min, max, !positivePath)); |
| if (c2 && c2.getValue() < 0) |
| return getAffineBinaryOpExpr( |
| bin.getKind(), substWithMin(lhs, dim, min, max, !positivePath), c2); |
| return getAffineBinaryOpExpr( |
| bin.getKind(), substWithMin(lhs, dim, min, max, positivePath), |
| substWithMin(rhs, dim, min, max, positivePath)); |
| } |
| return e; |
| } |
| |
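| // For example (illustrative), normalizing |
| //   affine.parallel (%i) = (2) to (20) step (4) |
| // rewrites it to |
| //   affine.parallel (%i) = (0) to (5) |
| // with in-body uses of %i going through an affine.apply computing 2 + %i * 4. |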
| void mlir::affine::normalizeAffineParallel(AffineParallelOp op) { |
| // Loops with min/max in bounds are not normalized at the moment. |
| if (op.hasMinMaxBounds()) |
| return; |
| |
| AffineMap lbMap = op.getLowerBoundsMap(); |
| SmallVector<int64_t, 8> steps = op.getSteps(); |
| // No need to do any work if the parallel op is already normalized. |
| bool isAlreadyNormalized = |
| llvm::all_of(llvm::zip(steps, lbMap.getResults()), [](auto tuple) { |
| int64_t step = std::get<0>(tuple); |
| auto lbExpr = dyn_cast<AffineConstantExpr>(std::get<1>(tuple)); |
| return lbExpr && lbExpr.getValue() == 0 && step == 1; |
| }); |
| if (isAlreadyNormalized) |
| return; |
| |
| AffineValueMap ranges; |
| AffineValueMap::difference(op.getUpperBoundsValueMap(), |
| op.getLowerBoundsValueMap(), &ranges); |
| auto builder = OpBuilder::atBlockBegin(op.getBody()); |
| auto zeroExpr = builder.getAffineConstantExpr(0); |
| SmallVector<AffineExpr, 8> lbExprs; |
| SmallVector<AffineExpr, 8> ubExprs; |
| for (unsigned i = 0, e = steps.size(); i < e; ++i) { |
| int64_t step = steps[i]; |
| |
| // Adjust the lower bound to be 0. |
| lbExprs.push_back(zeroExpr); |
| |
|     // Adjust the upper bound expression: 'range ceildiv step'. |
| AffineExpr ubExpr = ranges.getResult(i).ceilDiv(step); |
| ubExprs.push_back(ubExpr); |
| |
| // Adjust the corresponding IV: 'lb + i * step'. |
| BlockArgument iv = op.getBody()->getArgument(i); |
| AffineExpr lbExpr = lbMap.getResult(i); |
| unsigned nDims = lbMap.getNumDims(); |
| auto expr = lbExpr + builder.getAffineDimExpr(nDims) * step; |
| auto map = AffineMap::get(/*dimCount=*/nDims + 1, |
| /*symbolCount=*/lbMap.getNumSymbols(), expr); |
| |
| // Use an 'affine.apply' op that will be simplified later in subsequent |
| // canonicalizations. |
| OperandRange lbOperands = op.getLowerBoundsOperands(); |
| OperandRange dimOperands = lbOperands.take_front(nDims); |
| OperandRange symbolOperands = lbOperands.drop_front(nDims); |
| SmallVector<Value, 8> applyOperands{dimOperands}; |
| applyOperands.push_back(iv); |
| applyOperands.append(symbolOperands.begin(), symbolOperands.end()); |
| auto apply = builder.create<AffineApplyOp>(op.getLoc(), map, applyOperands); |
| iv.replaceAllUsesExcept(apply, apply); |
| } |
| |
| SmallVector<int64_t, 8> newSteps(op.getNumDims(), 1); |
| op.setSteps(newSteps); |
| auto newLowerMap = AffineMap::get( |
| /*dimCount=*/0, /*symbolCount=*/0, lbExprs, op.getContext()); |
| op.setLowerBounds({}, newLowerMap); |
| auto newUpperMap = AffineMap::get(ranges.getNumDims(), ranges.getNumSymbols(), |
| ubExprs, op.getContext()); |
| op.setUpperBounds(ranges.getOperands(), newUpperMap); |
| } |
| |
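| // For example (illustrative), normalizing |
| //   affine.for %i = 5 to 20 step 3 |
| // rewrites it to |
| //   affine.for %i = 0 to 5 |
| // with in-body uses of the old IV going through an affine.apply computing |
| // 5 + %i * 3. |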
| LogicalResult mlir::affine::normalizeAffineFor(AffineForOp op, |
| bool promoteSingleIter) { |
| if (promoteSingleIter && succeeded(promoteIfSingleIteration(op))) |
| return success(); |
| |
|   // Check if the for op is already normalized. |
| if (op.hasConstantLowerBound() && (op.getConstantLowerBound() == 0) && |
| (op.getStep() == 1)) |
| return success(); |
| |
| // Check if the lower bound has a single result only. Loops with a max lower |
| // bound can't be normalized without additional support like |
| // affine.execute_region's. If the lower bound does not have a single result |
| // then skip this op. |
| if (op.getLowerBoundMap().getNumResults() != 1) |
| return failure(); |
| |
| Location loc = op.getLoc(); |
| OpBuilder opBuilder(op); |
| int64_t origLoopStep = op.getStepAsInt(); |
| |
| // Construct the new upper bound value map. |
| AffineMap oldLbMap = op.getLowerBoundMap(); |
| // The upper bound can have multiple results. To use |
| // AffineValueMap::difference, we need to have the same number of results in |
| // both lower and upper bound maps. So, we just create a value map for the |
| // lower bound with the only available lower bound result repeated to pad up |
| // to the number of upper bound results. |
| SmallVector<AffineExpr> lbExprs(op.getUpperBoundMap().getNumResults(), |
| op.getLowerBoundMap().getResult(0)); |
| AffineValueMap lbMap(oldLbMap, op.getLowerBoundOperands()); |
| AffineMap paddedLbMap = |
| AffineMap::get(oldLbMap.getNumDims(), oldLbMap.getNumSymbols(), lbExprs, |
| op.getContext()); |
| AffineValueMap paddedLbValueMap(paddedLbMap, op.getLowerBoundOperands()); |
| AffineValueMap ubValueMap(op.getUpperBoundMap(), op.getUpperBoundOperands()); |
| AffineValueMap newUbValueMap; |
| // Compute the `upper bound - lower bound`. |
| AffineValueMap::difference(ubValueMap, paddedLbValueMap, &newUbValueMap); |
| (void)newUbValueMap.canonicalize(); |
| |
| // Scale down the upper bound value map by the loop step. |
| unsigned numResult = newUbValueMap.getNumResults(); |
| SmallVector<AffineExpr> scaleDownExprs(numResult); |
| for (unsigned i = 0; i < numResult; ++i) |
| scaleDownExprs[i] = opBuilder.getAffineDimExpr(i).ceilDiv(origLoopStep); |
|   // `scaleDownMap` is (d0, d1, ..., dn) -> (d0 ceildiv step, ..., dn ceildiv |
|   // step), where the number of dims equals the number of results in the upper |
|   // bound map. |
| AffineMap scaleDownMap = |
| AffineMap::get(numResult, 0, scaleDownExprs, op.getContext()); |
| AffineMap newUbMap = scaleDownMap.compose(newUbValueMap.getAffineMap()); |
| |
|   // Set the newly created upper bound map and operands. |
| op.setUpperBound(newUbValueMap.getOperands(), newUbMap); |
| op.setLowerBound({}, opBuilder.getConstantAffineMap(0)); |
| op.setStep(1); |
| |
|   // Calculate the value of the old loop IV in terms of the new one. Create an |
|   // affine.apply op computing it at the start of the normalized loop body. |
| opBuilder.setInsertionPointToStart(op.getBody()); |
| // Construct an affine.apply op mapping the new IV to the old IV. |
| AffineMap scaleIvMap = |
| AffineMap::get(1, 0, -opBuilder.getAffineDimExpr(0) * origLoopStep); |
| AffineValueMap scaleIvValueMap(scaleIvMap, ValueRange{op.getInductionVar()}); |
| AffineValueMap newIvToOldIvMap; |
| AffineValueMap::difference(lbMap, scaleIvValueMap, &newIvToOldIvMap); |
| (void)newIvToOldIvMap.canonicalize(); |
| auto newIV = opBuilder.create<AffineApplyOp>( |
| loc, newIvToOldIvMap.getAffineMap(), newIvToOldIvMap.getOperands()); |
| op.getInductionVar().replaceAllUsesExcept(newIV->getResult(0), newIV); |
| return success(); |
| } |
| |
| /// Returns true if the memory operation of `destAccess` depends on `srcAccess` |
| /// inside of the innermost common surrounding affine loop between the two |
| /// accesses. |
| static bool mustReachAtInnermost(const MemRefAccess &srcAccess, |
| const MemRefAccess &destAccess) { |
|   // Affine dependence analysis is possible only if both ops are in the same |
|   // AffineScope. |
| if (getAffineAnalysisScope(srcAccess.opInst) != |
| getAffineAnalysisScope(destAccess.opInst)) |
| return false; |
| |
| unsigned nsLoops = |
| getNumCommonSurroundingLoops(*srcAccess.opInst, *destAccess.opInst); |
| DependenceResult result = |
| checkMemrefAccessDependence(srcAccess, destAccess, nsLoops + 1); |
| return hasDependence(result); |
| } |
| |
| /// Returns true if `srcMemOp` may have an effect on `destMemOp` within the |
| /// scope of the outermost `minSurroundingLoops` loops that surround them. |
| /// `srcMemOp` and `destMemOp` are expected to be affine read/write ops. |
| static bool mayHaveEffect(Operation *srcMemOp, Operation *destMemOp, |
| unsigned minSurroundingLoops) { |
| MemRefAccess srcAccess(srcMemOp); |
| MemRefAccess destAccess(destMemOp); |
| |
| // Affine dependence analysis here is applicable only if both ops operate on |
| // the same memref and if `srcMemOp` and `destMemOp` are in the same |
|   // AffineScope. Also, we can only check if our affine scope is isolated from |
|   // above; otherwise, values can come from outside of the affine scope, which |
|   // the check below cannot analyze. |
| Region *srcScope = getAffineAnalysisScope(srcMemOp); |
| if (srcAccess.memref == destAccess.memref && |
| srcScope == getAffineAnalysisScope(destMemOp)) { |
| unsigned nsLoops = getNumCommonSurroundingLoops(*srcMemOp, *destMemOp); |
| FlatAffineValueConstraints dependenceConstraints; |
| for (unsigned d = nsLoops + 1; d > minSurroundingLoops; d--) { |
| DependenceResult result = checkMemrefAccessDependence( |
| srcAccess, destAccess, d, &dependenceConstraints, |
| /*dependenceComponents=*/nullptr); |
| // A dependence failure or the presence of a dependence implies a |
| // side effect. |
| if (!noDependence(result)) |
| return true; |
| } |
| // No side effect was seen. |
| return false; |
| } |
| // TODO: Check here if the memrefs alias: there is no side effect if |
| // `srcAccess.memref` and `destAccess.memref` don't alias. |
| return true; |
| } |
| |
| template <typename EffectType, typename T> |
| bool mlir::affine::hasNoInterveningEffect( |
| Operation *start, T memOp, |
| llvm::function_ref<bool(Value, Value)> mayAlias) { |
| // A boolean representing whether an intervening operation could have impacted |
| // memOp. |
| bool hasSideEffect = false; |
| |
| // Check whether the effect on memOp can be caused by a given operation op. |
| Value memref = memOp.getMemRef(); |
| std::function<void(Operation *)> checkOperation = [&](Operation *op) { |
|     // If the effect has already been found, exit early. |
| if (hasSideEffect) |
| return; |
| |
| if (auto memEffect = dyn_cast<MemoryEffectOpInterface>(op)) { |
| SmallVector<MemoryEffects::EffectInstance, 1> effects; |
| memEffect.getEffects(effects); |
| |
| bool opMayHaveEffect = false; |
| for (auto effect : effects) { |
| // If op causes EffectType on a potentially aliasing location for |
| // memOp, mark as having the effect. |
| if (isa<EffectType>(effect.getEffect())) { |
| if (effect.getValue() && effect.getValue() != memref && |
| !mayAlias(effect.getValue(), memref)) |
| continue; |
| opMayHaveEffect = true; |
| break; |
| } |
| } |
| |
| if (!opMayHaveEffect) |
| return; |
| |
| // If the side effect comes from an affine read or write, try to |
| // prove the side effecting `op` cannot reach `memOp`. |
| if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op)) { |
| // For ease, let's consider the case that `op` is a store and |
| // we're looking for other potential stores that overwrite memory after |
| // `start`, and before being read in `memOp`. In this case, we only |
| // need to consider other potential stores with depth > |
| // minSurroundingLoops since `start` would overwrite any store with a |
| // smaller number of surrounding loops before. |
| unsigned minSurroundingLoops = |
| getNumCommonSurroundingLoops(*start, *memOp); |
| if (mayHaveEffect(op, memOp, minSurroundingLoops)) |
| hasSideEffect = true; |
| return; |
| } |
| |
| // We have an op with a memory effect and we cannot prove if it |
| // intervenes. |
| hasSideEffect = true; |
| return; |
| } |
| |
| if (op->hasTrait<OpTrait::HasRecursiveMemoryEffects>()) { |
| // Recurse into the regions for this op and check whether the internal |
| // operations may have the side effect `EffectType` on memOp. |
| for (Region ®ion : op->getRegions()) |
| for (Block &block : region) |
| for (Operation &op : block) |
| checkOperation(&op); |
| return; |
| } |
| |
|     // Otherwise, conservatively assume generic operations have the effect |
|     // on `memOp`. |
| hasSideEffect = true; |
| }; |
| |
| // Check all paths from ancestor op `parent` to the operation `to` for the |
| // effect. It is known that `to` must be contained within `parent`. |
| auto until = [&](Operation *parent, Operation *to) { |
|     // TODO: check only the paths from `parent` to `to`. |
|     // Currently we fall back to checking the entire parent op rather than just |
|     // the paths from `parent` to `to`, stopping after reaching `to`. |
|     // This is conservatively correct, but could be made more aggressive. |
| assert(parent->isAncestor(to)); |
| checkOperation(parent); |
| }; |
| |
|   // Check all paths from operation `from` to operation `untilOp` for the given |
|   // memory effect. |
| std::function<void(Operation *, Operation *)> recur = |
| [&](Operation *from, Operation *untilOp) { |
| assert( |
| from->getParentRegion()->isAncestor(untilOp->getParentRegion()) && |
| "Checking for side effect between two operations without a common " |
| "ancestor"); |
| |
|         // If the operations are in different regions, recursively consider all |
|         // paths from `from` to the parent of `to` and all paths from the |
|         // parent of `to` to `to`. |
| if (from->getParentRegion() != untilOp->getParentRegion()) { |
| recur(from, untilOp->getParentOp()); |
| until(untilOp->getParentOp(), untilOp); |
| return; |
| } |
| |
| // Now, assuming that `from` and `to` exist in the same region, perform |
| // a CFG traversal to check all the relevant operations. |
| |
| // Additional blocks to consider. |
| SmallVector<Block *, 2> todoBlocks; |
| { |
|           // First consider the parent block of `from` and check all operations |
|           // after `from`. |
| for (auto iter = ++from->getIterator(), end = from->getBlock()->end(); |
| iter != end && &*iter != untilOp; ++iter) { |
| checkOperation(&*iter); |
| } |
| |
|           // If the block containing `from` doesn't contain `to`, add its |
|           // successors to the list of blocks to check. |
| if (untilOp->getBlock() != from->getBlock()) |
| for (Block *succ : from->getBlock()->getSuccessors()) |
| todoBlocks.push_back(succ); |
| } |
| |
| SmallPtrSet<Block *, 4> done; |
| // Traverse the CFG until hitting `to`. |
| while (!todoBlocks.empty()) { |
| Block *blk = todoBlocks.pop_back_val(); |
| if (done.count(blk)) |
| continue; |
| done.insert(blk); |
| for (auto &op : *blk) { |
| if (&op == untilOp) |
| break; |
| checkOperation(&op); |
| if (&op == blk->getTerminator()) |
| for (Block *succ : blk->getSuccessors()) |
| todoBlocks.push_back(succ); |
| } |
| } |
| }; |
| recur(start, memOp); |
| return !hasSideEffect; |
| } |
| |
| /// Attempt to eliminate loadOp by replacing it with a value stored into memory |
| /// which the load is guaranteed to retrieve. This check involves three |
| /// components: 1) the store and load must access the same location, 2) the |
| /// store must dominate (and therefore always execute before) the load, and |
| /// 3) no other operation may overwrite the memory read by the load between the |
| /// store and the load. If such a value exists, the replaced `loadOp` is added |
| /// to `loadOpsToErase` and its memref to `memrefsToErase`. |
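| /// |
| /// For example (illustrative, with hypothetical names), in |
| ///   affine.store %v, %A[%i] |
| ///   %x = affine.load %A[%i] |
| /// the load is replaced by %v provided no intervening write to %A[%i] exists. |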
| static void forwardStoreToLoad( |
| AffineReadOpInterface loadOp, SmallVectorImpl<Operation *> &loadOpsToErase, |
| SmallPtrSetImpl<Value> &memrefsToErase, DominanceInfo &domInfo, |
| llvm::function_ref<bool(Value, Value)> mayAlias) { |
| |
| // The store op candidate for forwarding that satisfies all conditions |
| // to replace the load, if any. |
| Operation *lastWriteStoreOp = nullptr; |
| |
| for (auto *user : loadOp.getMemRef().getUsers()) { |
| auto storeOp = dyn_cast<AffineWriteOpInterface>(user); |
| if (!storeOp) |
| continue; |
| MemRefAccess srcAccess(storeOp); |
| MemRefAccess destAccess(loadOp); |
| |
| // 1. Check if the store and the load have mathematically equivalent |
| // affine access functions; this implies that they statically refer to the |
| // same single memref element. As an example this filters out cases like: |
| // store %A[%i0 + 1] |
| // load %A[%i0] |
| // store %A[%M] |
| // load %A[%N] |
| // Use the AffineValueMap difference based memref access equality checking. |
| if (srcAccess != destAccess) |
| continue; |
| |
|     // 2. The store has to dominate the load op to be a candidate. |
| if (!domInfo.dominates(storeOp, loadOp)) |
| continue; |
| |
| // 3. The store must reach the load. Access function equivalence only |
| // guarantees this for accesses in the same block. The load could be in a |
| // nested block that is unreachable. |
| if (!mustReachAtInnermost(srcAccess, destAccess)) |
| continue; |
| |
| // 4. Ensure there is no intermediate operation which could replace the |
| // value in memory. |
| if (!affine::hasNoInterveningEffect<MemoryEffects::Write>(storeOp, loadOp, |
| mayAlias)) |
| continue; |
| |
| // We now have a candidate for forwarding. |
| assert(lastWriteStoreOp == nullptr && |
| "multiple simultaneous replacement stores"); |
| lastWriteStoreOp = storeOp; |
| } |
| |
| if (!lastWriteStoreOp) |
| return; |
| |
| // Perform the actual store to load forwarding. |
| Value storeVal = |
| cast<AffineWriteOpInterface>(lastWriteStoreOp).getValueToStore(); |
| // Check if 2 values have the same shape. This is needed for affine vector |
| // loads and stores. |
| if (storeVal.getType() != loadOp.getValue().getType()) |
| return; |
| loadOp.getValue().replaceAllUsesWith(storeVal); |
| // Record the memref for a later sweep to optimize away. |
| memrefsToErase.insert(loadOp.getMemRef()); |
| // Record this to erase later. |
| loadOpsToErase.push_back(loadOp); |
| } |
| |
| template bool |
| mlir::affine::hasNoInterveningEffect<mlir::MemoryEffects::Read, |
| affine::AffineReadOpInterface>( |
| mlir::Operation *, affine::AffineReadOpInterface, |
| llvm::function_ref<bool(Value, Value)>); |
| |
| // This attempts to find stores which have no impact on the final result. |
| // A write op writeA is eliminated if there exists an op writeB such that: |
| // 1) writeA and writeB have mathematically equivalent affine access functions. |
| // 2) writeB postdominates writeA. |
| // 3) There is no potential read between writeA and writeB. |
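| // |
| // For example (illustrative, with hypothetical names), in |
| //   affine.store %v0, %A[%i]  // writeA |
| //   affine.store %v1, %A[%i]  // writeB |
| // writeA is erased provided no read of %A[%i] occurs between the two stores. |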
| static void findUnusedStore(AffineWriteOpInterface writeA, |
| SmallVectorImpl<Operation *> &opsToErase, |
| PostDominanceInfo &postDominanceInfo, |
| llvm::function_ref<bool(Value, Value)> mayAlias) { |
| |
| for (Operation *user : writeA.getMemRef().getUsers()) { |
| // Only consider writing operations. |
| auto writeB = dyn_cast<AffineWriteOpInterface>(user); |
| if (!writeB) |
| continue; |
| |
| // The operations must be distinct. |
| if (writeB == writeA) |
| continue; |
| |
| // Both operations must lie in the same region. |
| if (writeB->getParentRegion() != writeA->getParentRegion()) |
| continue; |
| |
| // Both operations must write to the same memory. |
| MemRefAccess srcAccess(writeB); |
| MemRefAccess destAccess(writeA); |
| |
| if (srcAccess != destAccess) |
| continue; |
| |
| // writeB must postdominate writeA. |
| if (!postDominanceInfo.postDominates(writeB, writeA)) |
| continue; |
| |
| // There cannot be an operation which reads from memory between |
| // the two writes. |
| if (!affine::hasNoInterveningEffect<MemoryEffects::Read>(writeA, writeB, |
| mayAlias)) |
| continue; |
| |
| opsToErase.push_back(writeA); |
| break; |
| } |
| } |
| |
| // The load to load forwarding / redundant load elimination is similar to the |
| // store to load forwarding. |
| // loadA will be replaced with loadB if: |
| // 1) loadA and loadB have mathematically equivalent affine access functions. |
| // 2) loadB dominates loadA. |
| // 3) There is no write between loadA and loadB. |
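| // |
| // For example (illustrative, with hypothetical names), in |
| //   %0 = affine.load %A[%i]  // loadB |
| //   %1 = affine.load %A[%i]  // loadA |
| // loadA is replaced by %0 provided no write to %A[%i] occurs in between. |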
| static void loadCSE(AffineReadOpInterface loadA, |
| SmallVectorImpl<Operation *> &loadOpsToErase, |
| DominanceInfo &domInfo, |
| llvm::function_ref<bool(Value, Value)> mayAlias) { |
| SmallVector<AffineReadOpInterface, 4> loadCandidates; |
| for (auto *user : loadA.getMemRef().getUsers()) { |
| auto loadB = dyn_cast<AffineReadOpInterface>(user); |
| if (!loadB || loadB == loadA) |
| continue; |
| |
| MemRefAccess srcAccess(loadB); |
| MemRefAccess destAccess(loadA); |
| |
|     // 1. The accesses should be to the same location. |
| if (srcAccess != destAccess) { |
| continue; |
| } |
| |
| // 2. loadB should dominate loadA. |
| if (!domInfo.dominates(loadB, loadA)) |
| continue; |
| |
| // 3. There should not be a write between loadA and loadB. |
| if (!affine::hasNoInterveningEffect<MemoryEffects::Write>( |
| loadB.getOperation(), loadA, mayAlias)) |
| continue; |
| |
| // Check if two values have the same shape. This is needed for affine vector |
| // loads. |
| if (loadB.getValue().getType() != loadA.getValue().getType()) |
| continue; |
| |
| loadCandidates.push_back(loadB); |
| } |
| |
|   // Of the legal load candidates, use the one that dominates all others to |
|   // minimize the subsequent need for load CSE. |
| Value loadB; |
| for (AffineReadOpInterface option : loadCandidates) { |
| if (llvm::all_of(loadCandidates, [&](AffineReadOpInterface depStore) { |
| return depStore == option || |
| domInfo.dominates(option.getOperation(), |
| depStore.getOperation()); |
| })) { |
| loadB = option.getValue(); |
| break; |
| } |
| } |
| |
| if (loadB) { |
| loadA.getValue().replaceAllUsesWith(loadB); |
| // Record this to erase later. |
| loadOpsToErase.push_back(loadA); |
| } |
| } |
| |
| // The store to load forwarding and load CSE rely on three conditions: |
| // |
| // 1) store/load providing a replacement value and load being replaced need to |
| // have mathematically equivalent affine access functions (checked after full |
| // composition of load/store operands); this implies that they access the same |
| // single memref element for all iterations of the common surrounding loop, |
| // |
| // 2) the store/load op should dominate the load op, |
| // |
| // 3) no operation that may write to memory read by the load being replaced can |
| // occur after executing the instruction (load or store) providing the |
| // replacement value and before the load being replaced (thus potentially |
| // allowing overwriting the memory read by the load). |
| // |
| // The above conditions are simple to check and powerful for most cases in |
| // practice --- they are sufficient, but not necessary, since they don't |
| // reason about loops that are guaranteed to execute at least once or about |
| // multiple sources to forward from. |
| // |
| // TODO: more forwarding can be done when support for |
| // loop/conditional live-out SSA values is available. |
| // TODO: do general dead store elimination for memref's. This pass |
| // currently eliminates stores only if no other loads/uses (other |
| // than dealloc) remain. |
| // |
| void mlir::affine::affineScalarReplace(func::FuncOp f, DominanceInfo &domInfo, |
| PostDominanceInfo &postDomInfo, |
| AliasAnalysis &aliasAnalysis) { |
| // Load op's whose results were replaced by those forwarded from stores. |
| SmallVector<Operation *, 8> opsToErase; |
| |
| // A list of memref's that are potentially dead / could be eliminated. |
| SmallPtrSet<Value, 4> memrefsToErase; |
| |
| auto mayAlias = [&](Value val1, Value val2) -> bool { |
| return !aliasAnalysis.alias(val1, val2).isNo(); |
| }; |
| |
|   // Walk all loads and perform store-to-load forwarding. |
| f.walk([&](AffineReadOpInterface loadOp) { |
| forwardStoreToLoad(loadOp, opsToErase, memrefsToErase, domInfo, mayAlias); |
| }); |
| for (auto *op : opsToErase) |
| op->erase(); |
| opsToErase.clear(); |
| |
|   // Walk all stores and perform unused store elimination. |
| f.walk([&](AffineWriteOpInterface storeOp) { |
| findUnusedStore(storeOp, opsToErase, postDomInfo, mayAlias); |
| }); |
| for (auto *op : opsToErase) |
| op->erase(); |
| opsToErase.clear(); |
| |
| // Check if the store fwd'ed memrefs are now left with only stores and |
| // deallocs and can thus be completely deleted. Note: the canonicalize pass |
| // should be able to do this as well, but we'll do it here since we collected |
| // these anyway. |
| for (auto memref : memrefsToErase) { |
| // If the memref hasn't been locally alloc'ed, skip. |
| Operation *defOp = memref.getDefiningOp(); |
| if (!defOp || !hasSingleEffect<MemoryEffects::Allocate>(defOp, memref)) |
| // TODO: if the memref was returned by a 'call' operation, we |
| // could still erase it if the call had no side-effects. |
| continue; |
| if (llvm::any_of(memref.getUsers(), [&](Operation *ownerOp) { |
| return !isa<AffineWriteOpInterface>(ownerOp) && |
| !hasSingleEffect<MemoryEffects::Free>(ownerOp, memref); |
| })) |
| continue; |
| |
| // Erase all stores, the dealloc, and the alloc on the memref. |
| for (auto *user : llvm::make_early_inc_range(memref.getUsers())) |
| user->erase(); |
| defOp->erase(); |
| } |
| |
| // To eliminate as many loads as possible, run load CSE after eliminating |
| // stores. Otherwise, some stores are wrongly seen as having an intervening |
| // effect. |
| f.walk([&](AffineReadOpInterface loadOp) { |
| loadCSE(loadOp, opsToErase, domInfo, mayAlias); |
| }); |
| for (auto *op : opsToErase) |
| op->erase(); |
| } |
| |
| // Private helper function to transform memref.load with reduced rank. |
| // This function will modify the indices of the memref.load to match the |
| // newMemRef. |
| LogicalResult transformMemRefLoadWithReducedRank( |
| Operation *op, Value oldMemRef, Value newMemRef, unsigned memRefOperandPos, |
| ArrayRef<Value> extraIndices, ArrayRef<Value> extraOperands, |
| ArrayRef<Value> symbolOperands, AffineMap indexRemap) { |
| unsigned oldMemRefRank = cast<MemRefType>(oldMemRef.getType()).getRank(); |
| unsigned newMemRefRank = cast<MemRefType>(newMemRef.getType()).getRank(); |
| unsigned oldMapNumInputs = oldMemRefRank; |
| SmallVector<Value, 4> oldMapOperands( |
| op->operand_begin() + memRefOperandPos + 1, |
| op->operand_begin() + memRefOperandPos + 1 + oldMapNumInputs); |
| SmallVector<Value, 4> oldMemRefOperands; |
| oldMemRefOperands.assign(oldMapOperands.begin(), oldMapOperands.end()); |
| SmallVector<Value, 4> remapOperands; |
| remapOperands.reserve(extraOperands.size() + oldMemRefRank + |
| symbolOperands.size()); |
| remapOperands.append(extraOperands.begin(), extraOperands.end()); |
| remapOperands.append(oldMemRefOperands.begin(), oldMemRefOperands.end()); |
| remapOperands.append(symbolOperands.begin(), symbolOperands.end()); |
| |
| SmallVector<Value, 4> remapOutputs; |
| remapOutputs.reserve(oldMemRefRank); |
| SmallVector<Value, 4> affineApplyOps; |
| |
| OpBuilder builder(op); |
| |
| if (indexRemap && |
| indexRemap != builder.getMultiDimIdentityMap(indexRemap.getNumDims())) { |
| // Remapped indices. |
| for (auto resultExpr : indexRemap.getResults()) { |
| auto singleResMap = AffineMap::get( |
| indexRemap.getNumDims(), indexRemap.getNumSymbols(), resultExpr); |
| auto afOp = builder.create<AffineApplyOp>(op->getLoc(), singleResMap, |
| remapOperands); |
| remapOutputs.push_back(afOp); |
| affineApplyOps.push_back(afOp); |
| } |
| } else { |
| // No remapping specified. |
| remapOutputs.assign(remapOperands.begin(), remapOperands.end()); |
| } |
| |
| SmallVector<Value, 4> newMapOperands; |
| newMapOperands.reserve(newMemRefRank); |
| |
| // Prepend 'extraIndices' in 'newMapOperands'. |
| for (Value extraIndex : extraIndices) { |
| assert((isValidDim(extraIndex) || isValidSymbol(extraIndex)) && |
| "invalid memory op index"); |
| newMapOperands.push_back(extraIndex); |
| } |
| |
| // Append 'remapOutputs' to 'newMapOperands'. |
| newMapOperands.append(remapOutputs.begin(), remapOutputs.end()); |
| |
| // Create new fully composed AffineMap for new op to be created. |
| assert(newMapOperands.size() == newMemRefRank); |
| |
| OperationState state(op->getLoc(), op->getName()); |
| // Construct the new operation using this memref. |
| state.operands.reserve(newMapOperands.size() + extraIndices.size()); |
| state.operands.push_back(newMemRef); |
| |
| // Insert the new memref map operands. |
| state.operands.append(newMapOperands.begin(), newMapOperands.end()); |
| |
| state.types.reserve(op->getNumResults()); |
| for (auto result : op->getResults()) |
| state.types.push_back(result.getType()); |
| |
| // Copy over the attributes from the old operation to the new operation. |
| for (auto namedAttr : op->getAttrs()) { |
| state.attributes.push_back(namedAttr); |
| } |
| |
| // Create the new operation. |
| auto *repOp = builder.create(state); |
| op->replaceAllUsesWith(repOp); |
| op->erase(); |
| |
| return success(); |
| } |
| |
| // Perform the replacement of `oldMemRef` with `newMemRef` in `op`. |
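| // For example (illustrative), replacing a rank-1 memref with a rank-2 tiled |
| // view using indexRemap = (d0) -> (d0 floordiv 4, d0 mod 4) rewrites |
| //   affine.load %old[%i] |
| // into an equivalent access of the form |
| //   affine.load %new[%i floordiv 4, %i mod 4] |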
| LogicalResult mlir::affine::replaceAllMemRefUsesWith( |
| Value oldMemRef, Value newMemRef, Operation *op, |
| ArrayRef<Value> extraIndices, AffineMap indexRemap, |
| ArrayRef<Value> extraOperands, ArrayRef<Value> symbolOperands, |
| bool allowNonDereferencingOps) { |
| unsigned newMemRefRank = cast<MemRefType>(newMemRef.getType()).getRank(); |
| (void)newMemRefRank; // unused in opt mode |
| unsigned oldMemRefRank = cast<MemRefType>(oldMemRef.getType()).getRank(); |
| (void)oldMemRefRank; // unused in opt mode |
| if (indexRemap) { |
| assert(indexRemap.getNumSymbols() == symbolOperands.size() && |
| "symbolic operand count mismatch"); |
| assert(indexRemap.getNumInputs() == |
| extraOperands.size() + oldMemRefRank + symbolOperands.size()); |
| assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank); |
| } else { |
| assert(oldMemRefRank + extraIndices.size() == newMemRefRank); |
| } |
| |
| // Assert same elemental type. |
| assert(cast<MemRefType>(oldMemRef.getType()).getElementType() == |
| cast<MemRefType>(newMemRef.getType()).getElementType()); |
| |
| SmallVector<unsigned, 2> usePositions; |
| for (const auto &opEntry : llvm::enumerate(op->getOperands())) { |
| if (opEntry.value() == oldMemRef) |
| usePositions.push_back(opEntry.index()); |
| } |
| |
| // If memref doesn't appear, nothing to do. |
| if (usePositions.empty()) |
| return success(); |
| |
| if (usePositions.size() > 1) { |
| // TODO: extend it for this case when needed (rare). |
| assert(false && "multiple dereferencing uses in a single op not supported"); |
| return failure(); |
| } |
| |
| unsigned memRefOperandPos = usePositions.front(); |
| |
| OpBuilder builder(op); |
| // The following checks if op is dereferencing memref and performs the access |
| // index rewrites. |
| auto affMapAccInterface = dyn_cast<AffineMapAccessInterface>(op); |
| if (!affMapAccInterface) { |
| if (!allowNonDereferencingOps) { |
| // Failure: memref used in a non-dereferencing context (potentially |
| // escapes); no replacement in these cases unless allowNonDereferencingOps |
| // is set. |
| return failure(); |
| } |
| |
| // Check if it is a memref.load |
| auto memrefLoad = dyn_cast<memref::LoadOp>(op); |
| bool isReductionLike = |
| indexRemap.getNumResults() < indexRemap.getNumInputs(); |
| if (!memrefLoad || !isReductionLike) { |
| op->setOperand(memRefOperandPos, newMemRef); |
| return success(); |
| } |
| |
| return transformMemRefLoadWithReducedRank( |
| op, oldMemRef, newMemRef, memRefOperandPos, extraIndices, extraOperands, |
| symbolOperands, indexRemap); |
| } |
|   // Perform index rewrites for the dereferencing op and then replace the op. |
| NamedAttribute oldMapAttrPair = |
| affMapAccInterface.getAffineMapAttrForMemRef(oldMemRef); |
| AffineMap oldMap = cast<AffineMapAttr>(oldMapAttrPair.getValue()).getValue(); |
| unsigned oldMapNumInputs = oldMap.getNumInputs(); |
| SmallVector<Value, 4> oldMapOperands( |
| op->operand_begin() + memRefOperandPos + 1, |
| op->operand_begin() + memRefOperandPos + 1 + oldMapNumInputs); |
| |
| // Apply 'oldMemRefOperands = oldMap(oldMapOperands)'. |
| SmallVector<Value, 4> oldMemRefOperands; |
| SmallVector<Value, 4> affineApplyOps; |
| oldMemRefOperands.reserve(oldMemRefRank); |
| if (oldMap != builder.getMultiDimIdentityMap(oldMap.getNumDims())) { |
| for (auto resultExpr : oldMap.getResults()) { |
| auto singleResMap = AffineMap::get(oldMap.getNumDims(), |
| oldMap.getNumSymbols(), resultExpr); |
| auto afOp = builder.create<AffineApplyOp>(op->getLoc(), singleResMap, |
| oldMapOperands); |
| oldMemRefOperands.push_back(afOp); |
| affineApplyOps.push_back(afOp); |
| } |
| } else { |
| oldMemRefOperands.assign(oldMapOperands.begin(), oldMapOperands.end()); |
| } |
| |
| // Construct new indices as a remap of the old ones if a remapping has been |
| // provided. The indices of a memref come right after it, i.e., |
| // at position memRefOperandPos + 1. |
| SmallVector<Value, 4> remapOperands; |
| remapOperands.reserve(extraOperands.size() + oldMemRefRank + |
| symbolOperands.size()); |
| remapOperands.append(extraOperands.begin(), extraOperands.end()); |
| remapOperands.append(oldMemRefOperands.begin(), oldMemRefOperands.end()); |
| remapOperands.append(symbolOperands.begin(), symbolOperands.end()); |
| |
| SmallVector<Value, 4> remapOutputs; |
| remapOutputs.reserve(oldMemRefRank); |
| |
| if (indexRemap && |
| indexRemap != builder.getMultiDimIdentityMap(indexRemap.getNumDims())) { |
| // Remapped indices. |
| for (auto resultExpr : indexRemap.getResults()) { |
| auto singleResMap = AffineMap::get( |
| indexRemap.getNumDims(), indexRemap.getNumSymbols(), resultExpr); |
| auto afOp = builder.create<AffineApplyOp>(op->getLoc(), singleResMap, |
| remapOperands); |
| remapOutputs.push_back(afOp); |
| affineApplyOps.push_back(afOp); |
| } |
| } else { |
| // No remapping specified. |
| remapOutputs.assign(remapOperands.begin(), remapOperands.end()); |
| } |
| |
| SmallVector<Value, 4> newMapOperands; |
| newMapOperands.reserve(newMemRefRank); |
| |
| // Prepend 'extraIndices' in 'newMapOperands'. |
| for (Value extraIndex : extraIndices) { |
| assert((isValidDim(extraIndex) || isValidSymbol(extraIndex)) && |
| "invalid memory op index"); |
| newMapOperands.push_back(extraIndex); |
| } |
| |
| // Append 'remapOutputs' to 'newMapOperands'. |
| newMapOperands.append(remapOutputs.begin(), remapOutputs.end()); |
| |
| // Create new fully composed AffineMap for new op to be created. |
| assert(newMapOperands.size() == newMemRefRank); |
| auto newMap = builder.getMultiDimIdentityMap(newMemRefRank); |
| fullyComposeAffineMapAndOperands(&newMap, &newMapOperands); |
| newMap = simplifyAffineMap(newMap); |
| canonicalizeMapAndOperands(&newMap, &newMapOperands); |
| // Remove any affine.apply's that became dead as a result of composition. |
| for (Value value : affineApplyOps) |
| if (value.use_empty()) |
| value.getDefiningOp()->erase(); |
| |
| OperationState state(op->getLoc(), op->getName()); |
| // Construct the new operation using this memref. |
| state.operands.reserve(op->getNumOperands() + extraIndices.size()); |
| // Insert the non-memref operands. |
| state.operands.append(op->operand_begin(), |
| op->operand_begin() + memRefOperandPos); |
| // Insert the new memref value. |
| state.operands.push_back(newMemRef); |
| |
| // Insert the new memref map operands. |
| state.operands.append(newMapOperands.begin(), newMapOperands.end()); |
| |
| // Insert the remaining operands unmodified. |
| state.operands.append(op->operand_begin() + memRefOperandPos + 1 + |
| oldMapNumInputs, |
| op->operand_end()); |
| |
| // Result types don't change. Both memref's are of the same elemental type. |
| state.types.reserve(op->getNumResults()); |
| for (auto result : op->getResults()) |
| state.types.push_back(result.getType()); |
| |
| // Add attribute for 'newMap', other Attributes do not change. |
| auto newMapAttr = AffineMapAttr::get(newMap); |
| for (auto namedAttr : op->getAttrs()) { |
| if (namedAttr.getName() == oldMapAttrPair.getName()) |
| state.attributes.push_back({namedAttr.getName(), newMapAttr}); |
| else |
| state.attributes.push_back(namedAttr); |
| } |
| |
| // Create the new operation. |
| auto *repOp = builder.create(state); |
| op->replaceAllUsesWith(repOp); |
| op->erase(); |
| |
| return success(); |
| } |
| |
| LogicalResult mlir::affine::replaceAllMemRefUsesWith( |
| Value oldMemRef, Value newMemRef, ArrayRef<Value> extraIndices, |
| AffineMap indexRemap, ArrayRef<Value> extraOperands, |
| ArrayRef<Value> symbolOperands, Operation *domOpFilter, |
| Operation *postDomOpFilter, bool allowNonDereferencingOps, |
| bool replaceInDeallocOp) { |
| unsigned newMemRefRank = cast<MemRefType>(newMemRef.getType()).getRank(); |
| (void)newMemRefRank; // unused in opt mode |
| unsigned oldMemRefRank = cast<MemRefType>(oldMemRef.getType()).getRank(); |
| (void)oldMemRefRank; |
| if (indexRemap) { |
| assert(indexRemap.getNumSymbols() == symbolOperands.size() && |
| "symbol operand count mismatch"); |
| assert(indexRemap.getNumInputs() == |
| extraOperands.size() + oldMemRefRank + symbolOperands.size()); |
| assert(indexRemap.getNumResults() + extraIndices.size() == newMemRefRank); |
| } else { |
| assert(oldMemRefRank + extraIndices.size() == newMemRefRank); |
| } |
| |
| // Assert same elemental type. |
| assert(cast<MemRefType>(oldMemRef.getType()).getElementType() == |
| cast<MemRefType>(newMemRef.getType()).getElementType()); |
| |
| std::unique_ptr<DominanceInfo> domInfo; |
| std::unique_ptr<PostDominanceInfo> postDomInfo; |
| if (domOpFilter) |
| domInfo = std::make_unique<DominanceInfo>( |
| domOpFilter->getParentOfType<FunctionOpInterface>()); |
| |
| if (postDomOpFilter) |
| postDomInfo = std::make_unique<PostDominanceInfo>( |
| postDomOpFilter->getParentOfType<FunctionOpInterface>()); |
| |
| // Walk all uses of old memref; collect ops to perform replacement. We use a |
| // DenseSet since an operation could potentially have multiple uses of a |
| // memref (although rare), and the replacement later is going to erase ops. |
| DenseSet<Operation *> opsToReplace; |
| for (auto *op : oldMemRef.getUsers()) { |
| // Skip this use if it's not dominated by domOpFilter. |
| if (domOpFilter && !domInfo->dominates(domOpFilter, op)) |
| continue; |
| |
| // Skip this use if it's not post-dominated by postDomOpFilter. |
| if (postDomOpFilter && !postDomInfo->postDominates(postDomOpFilter, op)) |
| continue; |
| |
| // Skip deallocs - no replacement is necessary, and a memref replacement |
| // at other uses doesn't hurt these deallocs. |
| if (hasSingleEffect<MemoryEffects::Free>(op, oldMemRef) && |
| !replaceInDeallocOp) |
| continue; |
| |
| // Check if the memref was used in a non-dereferencing context. It is fine |
| // for the memref to be used in a non-dereferencing way outside of the |
| // region where this replacement is happening. |
| if (!isa<AffineMapAccessInterface>(*op)) { |
| if (!allowNonDereferencingOps) { |
| LLVM_DEBUG(llvm::dbgs() |
| << "Memref replacement failed: non-deferencing memref op: \n" |
| << *op << '\n'); |
| return failure(); |
| } |
| // Non-dereferencing ops with the MemRefsNormalizable trait are |
| // supported for replacement. |
| if (!op->hasTrait<OpTrait::MemRefsNormalizable>()) { |
| LLVM_DEBUG(llvm::dbgs() << "Memref replacement failed: use without a " |
| "memrefs normalizable trait: \n" |
| << *op << '\n'); |
| return failure(); |
| } |
| } |
| |
| // We'll first collect and then replace, since replacement erases the op |
| // that has the use, and that op could be domOpFilter or postDomOpFilter |
| // itself! |
| opsToReplace.insert(op); |
| } |
| |
| for (auto *op : opsToReplace) { |
| if (failed(replaceAllMemRefUsesWith( |
| oldMemRef, newMemRef, op, extraIndices, indexRemap, extraOperands, |
| symbolOperands, allowNonDereferencingOps))) |
| llvm_unreachable("memref replacement guaranteed to succeed here"); |
| } |
| |
| return success(); |
| } |
| |
| /// Given an operation, inserts one or more single-result affine apply |
| /// operations, the results of which are exclusively used by this operation. |
| /// The operands of these newly created affine apply ops are guaranteed to be |
| /// loop iterators or terminal symbols of a function. |
| /// |
| /// Before |
| /// |
| /// affine.for %i = 0 to #map(%N) |
| /// %idx = affine.apply (d0) -> (d0 mod 2) (%i) |
| /// "send"(%idx, %A, ...) |
| /// "compute"(%idx) |
| /// |
| /// After |
| /// |
| /// affine.for %i = 0 to #map(%N) |
| /// %idx = affine.apply (d0) -> (d0 mod 2) (%i) |
| /// "send"(%idx, %A, ...) |
| /// %idx_ = affine.apply (d0) -> (d0 mod 2) (%i) |
| /// "compute"(%idx_) |
| /// |
| /// This allows applying different transformations on send and compute (e.g., |
| /// different shifts/delays). |
| /// |
| /// Leaves `sliceOps` empty either if none of opInst's operands were the result |
| /// of an affine.apply (i.e., there was no affine computation slice to create), |
| /// or if all the affine.apply ops supplying operands to this opInst had no |
| /// uses other than this opInst; otherwise, the affine.apply operations created |
| /// are returned in the output argument `sliceOps`. |
| void mlir::affine::createAffineComputationSlice( |
| Operation *opInst, SmallVectorImpl<AffineApplyOp> *sliceOps) { |
| // Collect all operands that are results of affine apply ops. |
| SmallVector<Value, 4> subOperands; |
| subOperands.reserve(opInst->getNumOperands()); |
| for (auto operand : opInst->getOperands()) |
| if (isa_and_nonnull<AffineApplyOp>(operand.getDefiningOp())) |
| subOperands.push_back(operand); |
| |
| // Gather sequence of AffineApplyOps reachable from 'subOperands'. |
| SmallVector<Operation *, 4> affineApplyOps; |
| getReachableAffineApplyOps(subOperands, affineApplyOps); |
| // Skip transforming if there are no affine maps to compose. |
| if (affineApplyOps.empty()) |
| return; |
| |
| // Check if all uses of the affine apply ops lie only in this op, in which |
| // case there would be nothing to do. |
| bool localized = true; |
| for (auto *op : affineApplyOps) { |
| for (auto result : op->getResults()) { |
| for (auto *user : result.getUsers()) { |
| if (user != opInst) { |
| localized = false; |
| break; |
| } |
| } |
| } |
| } |
| if (localized) |
| return; |
| |
| OpBuilder builder(opInst); |
| SmallVector<Value, 4> composedOpOperands(subOperands); |
| auto composedMap = builder.getMultiDimIdentityMap(composedOpOperands.size()); |
| fullyComposeAffineMapAndOperands(&composedMap, &composedOpOperands); |
| |
| // Create an affine.apply for each of the map results. |
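| // For illustration (hypothetical maps): if the composed map is |
| // (d0, d1) -> (d0 floordiv 2, d1 mod 4), two affine.apply ops are created, |
| // one with map (d0, d1) -> (d0 floordiv 2) and one with |
| // (d0, d1) -> (d1 mod 4), both taking `composedOpOperands` as operands. |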
| sliceOps->reserve(composedMap.getNumResults()); |
| for (auto resultExpr : composedMap.getResults()) { |
| auto singleResMap = AffineMap::get(composedMap.getNumDims(), |
| composedMap.getNumSymbols(), resultExpr); |
| sliceOps->push_back(builder.create<AffineApplyOp>( |
| opInst->getLoc(), singleResMap, composedOpOperands)); |
| } |
| |
| // Construct the new operands, which include the results from the composed |
| // affine apply ops above instead of the existing ones (subOperands). They |
| // differ from opInst's operands only for the operands in 'subOperands', each |
| // of which is replaced by the corresponding result from 'sliceOps'. |
| SmallVector<Value, 4> newOperands(opInst->getOperands()); |
| for (Value &operand : newOperands) { |
| // Replace the subOperands from among the new operands. |
| unsigned j, f; |
| for (j = 0, f = subOperands.size(); j < f; j++) { |
| if (operand == subOperands[j]) |
| break; |
| } |
| if (j < subOperands.size()) |
| operand = (*sliceOps)[j]; |
| } |
| for (unsigned idx = 0, e = newOperands.size(); idx < e; idx++) |
| opInst->setOperand(idx, newOperands[idx]); |
| } |
| |
| /// Enum to classify the pattern of an affine expr in a tiled-layout map. |
| /// TileFloorDiv: <dim expr> div <tile size> |
| /// TileMod: <dim expr> mod <tile size> |
| /// TileNone: none of the above |
| /// Example: |
| /// #tiled_2d_128x256 = affine_map<(d0, d1) |
| /// -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)> |
| /// "d0 div 128" and "d1 div 256" ==> TileFloorDiv |
| /// "d0 mod 128" and "d1 mod 256" ==> TileMod |
| enum TileExprPattern { TileFloorDiv, TileMod, TileNone }; |
| |
| /// Check if `map` is a tiled layout. In a tiled layout, specific k dimensions |
| /// that are floordiv'ed by their respective tile sizes also appear in a mod |
| /// with the same tile sizes, and no other expression involves those k |
| /// dimensions. This function stores in `tileSizePos` a vector of tuples |
| /// holding the tile-size AffineExpr and the positions of the corresponding |
| /// `floordiv` and `mod`. If `map` is not a tiled layout, `tileSizePos` is left |
| /// empty. |
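| /// For example (illustrative only), for |
| ///   #tiled = affine_map<(d0, d1) |
| ///            -> (d0 floordiv 128, d1 floordiv 256, d0 mod 128, d1 mod 256)> |
| /// `tileSizePos` would hold {(128, 0, 2), (256, 1, 3)}: the tile-size expr, |
| /// the position of its `floordiv` result, and the position of its `mod` |
| /// result. |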
| static LogicalResult getTileSizePos( |
| AffineMap map, |
| SmallVectorImpl<std::tuple<AffineExpr, unsigned, unsigned>> &tileSizePos) { |
| // Create `floordivExprs` which is a vector of tuples including LHS and RHS of |
| // `floordiv` and its position in `map` output. |
| // Example: #tiled_2d_128x256 = affine_map<(d0, d1) |
| // -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)> |
| // In this example, `floordivExprs` includes {d0, 128, 0} and {d1, 256, 1}. |
| SmallVector<std::tuple<AffineExpr, AffineExpr, unsigned>, 4> floordivExprs; |
| unsigned pos = 0; |
| for (AffineExpr expr : map.getResults()) { |
| if (expr.getKind() == AffineExprKind::FloorDiv) { |
| AffineBinaryOpExpr binaryExpr = cast<AffineBinaryOpExpr>(expr); |
| if (isa<AffineConstantExpr>(binaryExpr.getRHS())) |
| floordivExprs.emplace_back( |
| std::make_tuple(binaryExpr.getLHS(), binaryExpr.getRHS(), pos)); |
| } |
| pos++; |
| } |
| // Not a tiled layout if `floordivExprs` is empty. |
| if (floordivExprs.empty()) { |
| tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{}; |
| return success(); |
| } |
| |
| // Check if the LHS of each `floordiv` is used in the LHS of a `mod`. If not, |
| // `map` is not a tiled layout. |
| for (std::tuple<AffineExpr, AffineExpr, unsigned> fexpr : floordivExprs) { |
| AffineExpr floordivExprLHS = std::get<0>(fexpr); |
| AffineExpr floordivExprRHS = std::get<1>(fexpr); |
| unsigned floordivPos = std::get<2>(fexpr); |
| |
| // Walk the affine exprs of the `map` output other than `fexpr`, and check |
| // whether the LHS and RHS of `fexpr` are used as the LHS and RHS of a `mod`. |
| // If the LHS of `fexpr` is used in any other expr, the map is not a tiled |
| // layout. Examples of non-tiled layouts: |
| // affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 floordiv 256)> |
| // affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 128)> |
| // affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 mod 256, d2 mod |
| // 256)> |
| bool found = false; |
| pos = 0; |
| for (AffineExpr expr : map.getResults()) { |
| bool notTiled = false; |
| if (pos != floordivPos) { |
| expr.walk([&](AffineExpr e) { |
| if (e == floordivExprLHS) { |
| if (expr.getKind() == AffineExprKind::Mod) { |
| AffineBinaryOpExpr binaryExpr = cast<AffineBinaryOpExpr>(expr); |
| // Check whether the LHS and RHS of `mod` are the same as those of the |
| // floordiv. |
| if (floordivExprLHS == binaryExpr.getLHS() && |
| floordivExprRHS == binaryExpr.getRHS()) { |
| // Save the tile size (RHS of `mod`) and the positions of `floordiv` and |
| // `mod` if a matching `mod` has not been found yet. |
| if (!found) { |
| tileSizePos.emplace_back( |
| std::make_tuple(binaryExpr.getRHS(), floordivPos, pos)); |
| found = true; |
| } else { |
| // Non-tiled layout: multiple `mod`s with the same LHS. |
| // eg. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 |
| // mod 256, d2 mod 256)> |
| notTiled = true; |
| } |
| } else { |
| // Non-tiled layout: the RHS of `mod` differs from that of `floordiv`. |
| // eg. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 |
| // mod 128)> |
| notTiled = true; |
| } |
| } else { |
| // Non-tiled layout: the LHS is the same, but the expr is not a `mod`. |
| // eg. affine_map<(d0, d1, d2) -> (d0, d1, d2 floordiv 256, d2 |
| // floordiv 256)> |
| notTiled = true; |
| } |
| } |
| }); |
| } |
| if (notTiled) { |
| tileSizePos = SmallVector<std::tuple<AffineExpr, unsigned, unsigned>>{}; |
| return success(); |
| } |
| pos++; |
| } |
| } |
| return success(); |
| } |
| |
| /// Check if the `dim` dimension of a memrefType with `layoutMap` becomes |
| /// dynamic after normalization. A map result that involves a dynamic input |
| /// dimension becomes a dynamic output dimension. Returns true if `dim` is a |
| /// dynamic dimension. |
| /// |
| /// Example: |
| /// #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)> |
| /// |
| /// If d1 is a dynamic dimension, the 2nd and 3rd map results are dynamic. |
| /// memref<4x?xf32, #map0> ==> memref<4x?x?xf32> |
| static bool |
| isNormalizedMemRefDynamicDim(unsigned dim, AffineMap layoutMap, |
| SmallVectorImpl<unsigned> &inMemrefTypeDynDims) { |
| AffineExpr expr = layoutMap.getResults()[dim]; |
| // Check if affine expr of the dimension includes dynamic dimension of input |
| // memrefType. |
| MLIRContext *context = layoutMap.getContext(); |
| return expr |
| .walk([&](AffineExpr e) { |
| if (isa<AffineDimExpr>(e) && |
| llvm::any_of(inMemrefTypeDynDims, [&](unsigned dim) { |
| return e == getAffineDimExpr(dim, context); |
| })) |
| return WalkResult::interrupt(); |
| return WalkResult::advance(); |
| }) |
| .wasInterrupted(); |
| } |
| |
| /// Create affine expr to calculate dimension size for a tiled-layout map. |
| static AffineExpr createDimSizeExprForTiledLayout(AffineExpr oldMapOutput, |
| TileExprPattern pat) { |
| // Create map output for the patterns. |
| // "floordiv <tile size>" ==> "ceildiv <tile size>" |
| // "mod <tile size>" ==> "<tile size>" |
| AffineExpr newMapOutput; |
| AffineBinaryOpExpr binaryExpr = nullptr; |
| switch (pat) { |
| case TileExprPattern::TileMod: |
| binaryExpr = cast<AffineBinaryOpExpr>(oldMapOutput); |
| newMapOutput = binaryExpr.getRHS(); |
| break; |
| case TileExprPattern::TileFloorDiv: |
| binaryExpr = cast<AffineBinaryOpExpr>(oldMapOutput); |
| newMapOutput = getAffineBinaryOpExpr( |
| AffineExprKind::CeilDiv, binaryExpr.getLHS(), binaryExpr.getRHS()); |
| break; |
| default: |
| newMapOutput = oldMapOutput; |
| } |
| return newMapOutput; |
| } |
| |
| /// Create new maps to calculate each dimension size of `newMemRefType`, and |
| /// create `newDynamicSizes` from them by using AffineApplyOp. |
| /// |
| /// Steps for normalizing dynamic memrefs for a tiled layout map |
| /// Example: |
| /// #map0 = affine_map<(d0, d1) -> (d0, d1 floordiv 32, d1 mod 32)> |
| /// %0 = dim %arg0, %c1 :memref<4x?xf32> |
| /// %1 = alloc(%0) : memref<4x?xf32, #map0> |
| /// |
| /// (Before this function) |
| /// 1. Check if `map` (#map0) is a tiled layout using `getTileSizePos()`. Only |
| /// a single affine-map layout is supported. |
| /// |
| /// 2. Create normalized memrefType using `isNormalizedMemRefDynamicDim()`. It |
| /// is memref<4x?x?xf32> in the above example. |
| /// |
| /// (In this function) |
| /// 3. Create new maps to calculate each dimension of the normalized memrefType |
| /// using `createDimSizeExprForTiledLayout()`. In the tiled layout, the |
| /// dimension size can be calculated by replacing "floordiv <tile size>" with |
| /// "ceildiv <tile size>" and "mod <tile size>" with "<tile size>". |
| /// - New map in the above example |
| /// #map0 = affine_map<(d0, d1) -> (d0)> |
| /// #map1 = affine_map<(d0, d1) -> (d1 ceildiv 32)> |
| /// #map2 = affine_map<(d0, d1) -> (32)> |
| /// |
| /// 4. Create AffineApplyOp to apply the new maps. The output of AffineApplyOp |
| /// is used in dynamicSizes of new AllocOp. |
| /// %0 = dim %arg0, %c1 : memref<4x?xf32> |
| /// %c4 = arith.constant 4 : index |
| /// %1 = affine.apply #map1(%c4, %0) |
| /// %2 = affine.apply #map2(%c4, %0) |
| template <typename AllocLikeOp> |
| static void createNewDynamicSizes(MemRefType oldMemRefType, |
| MemRefType newMemRefType, AffineMap map, |
| AllocLikeOp *allocOp, OpBuilder b, |
| SmallVectorImpl<Value> &newDynamicSizes) { |
| // Create new input for AffineApplyOp. |
| SmallVector<Value, 4> inAffineApply; |
| ArrayRef<int64_t> oldMemRefShape = oldMemRefType.getShape(); |
| unsigned dynIdx = 0; |
| for (unsigned d = 0; d < oldMemRefType.getRank(); ++d) { |
| if (oldMemRefShape[d] < 0) { |
| // Use dynamicSizes of allocOp for dynamic dimension. |
| inAffineApply.emplace_back(allocOp->getDynamicSizes()[dynIdx]); |
| dynIdx++; |
| } else { |
| // Create ConstantOp for static dimension. |
| auto constantAttr = b.getIntegerAttr(b.getIndexType(), oldMemRefShape[d]); |
| inAffineApply.emplace_back( |
| b.create<arith::ConstantOp>(allocOp->getLoc(), constantAttr)); |
| } |
| } |
| |
| // Create a new map to calculate each dimension size of the new memref for |
| // each original map output, but only for the dynamic dimensions of |
| // `newMemRefType`. |
| unsigned newDimIdx = 0; |
| ArrayRef<int64_t> newMemRefShape = newMemRefType.getShape(); |
| SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos; |
| (void)getTileSizePos(map, tileSizePos); |
| for (AffineExpr expr : map.getResults()) { |
| if (newMemRefShape[newDimIdx] < 0) { |
| // Create new maps to calculate each dimension size of new memref. |
| enum TileExprPattern pat = TileExprPattern::TileNone; |
| for (auto pos : tileSizePos) { |
| if (newDimIdx == std::get<1>(pos)) |
| pat = TileExprPattern::TileFloorDiv; |
| else if (newDimIdx == std::get<2>(pos)) |
| pat = TileExprPattern::TileMod; |
| } |
| AffineExpr newMapOutput = createDimSizeExprForTiledLayout(expr, pat); |
| AffineMap newMap = |
| AffineMap::get(map.getNumInputs(), map.getNumSymbols(), newMapOutput); |
| Value affineApp = |
| b.create<AffineApplyOp>(allocOp->getLoc(), newMap, inAffineApply); |
| newDynamicSizes.emplace_back(affineApp); |
| } |
| newDimIdx++; |
| } |
| } |
| |
| // TODO: Currently works for static memrefs with a single layout map. |
| template <typename AllocLikeOp> |
| LogicalResult mlir::affine::normalizeMemRef(AllocLikeOp *allocOp) { |
| MemRefType memrefType = allocOp->getType(); |
| OpBuilder b(*allocOp); |
| |
| // Fetch a new memref type after normalizing the old memref to have an |
| // identity map layout. |
| MemRefType newMemRefType = normalizeMemRefType(memrefType); |
| if (newMemRefType == memrefType) |
| // Either memrefType already had an identity map or the map couldn't be |
| // transformed to an identity map. |
| return failure(); |
| |
| Value oldMemRef = allocOp->getResult(); |
| |
| SmallVector<Value, 4> symbolOperands(allocOp->getSymbolOperands()); |
| AffineMap layoutMap = memrefType.getLayout().getAffineMap(); |
| AllocLikeOp newAlloc; |
| // Check if `layoutMap` is a tiled layout. Only a single affine-map layout is |
| // supported for normalizing dynamic memrefs. |
| SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos; |
| (void)getTileSizePos(layoutMap, tileSizePos); |
| if (newMemRefType.getNumDynamicDims() > 0 && !tileSizePos.empty()) { |
| MemRefType oldMemRefType = cast<MemRefType>(oldMemRef.getType()); |
| SmallVector<Value, 4> newDynamicSizes; |
| createNewDynamicSizes(oldMemRefType, newMemRefType, layoutMap, allocOp, b, |
| newDynamicSizes); |
| // Add the new dynamic sizes in new AllocOp. |
| newAlloc = |
| b.create<AllocLikeOp>(allocOp->getLoc(), newMemRefType, newDynamicSizes, |
| allocOp->getAlignmentAttr()); |
| } else { |
| newAlloc = b.create<AllocLikeOp>(allocOp->getLoc(), newMemRefType, |
| allocOp->getAlignmentAttr()); |
| } |
| // Replace all uses of the old memref. |
| if (failed(replaceAllMemRefUsesWith(oldMemRef, /*newMemRef=*/newAlloc, |
| /*extraIndices=*/{}, |
| /*indexRemap=*/layoutMap, |
| /*extraOperands=*/{}, |
| /*symbolOperands=*/symbolOperands, |
| /*domOpFilter=*/nullptr, |
| /*postDomOpFilter=*/nullptr, |
| /*allowNonDereferencingOps=*/true))) { |
| // If it failed (due to escapes for example), bail out. |
| newAlloc.erase(); |
| return failure(); |
| } |
| // Replace any remaining uses of the original alloc op and erase it. All |
| // remaining uses have to be deallocs; replaceAllMemRefUsesWith above would |
| // have failed otherwise. |
| assert(llvm::all_of(oldMemRef.getUsers(), [&](Operation *op) { |
| return hasSingleEffect<MemoryEffects::Free>(op, oldMemRef); |
| })); |
| oldMemRef.replaceAllUsesWith(newAlloc); |
| allocOp->erase(); |
| return success(); |
| } |
| |
| template LogicalResult |
| mlir::affine::normalizeMemRef<memref::AllocaOp>(memref::AllocaOp *op); |
| template LogicalResult |
| mlir::affine::normalizeMemRef<memref::AllocOp>(memref::AllocOp *op); |
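| |
| // Illustrative usage only (a sketch; `funcOp` is a hypothetical enclosing |
| // function op, and callers such as a normalization pass typically collect the |
| // alloc-like ops first since normalizeMemRef erases them): |
| //   SmallVector<memref::AllocOp> allocOps; |
| //   funcOp.walk([&](memref::AllocOp op) { allocOps.push_back(op); }); |
| //   for (memref::AllocOp op : allocOps) |
| //     (void)normalizeMemRef(&op); |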
| |
| MemRefType mlir::affine::normalizeMemRefType(MemRefType memrefType) { |
| unsigned rank = memrefType.getRank(); |
| if (rank == 0) |
| return memrefType; |
| |
| if (memrefType.getLayout().isIdentity()) { |
| // Either no layout map is associated with this memref or this memref has |
| // a trivial (identity) layout map. |
| return memrefType; |
| } |
| AffineMap layoutMap = memrefType.getLayout().getAffineMap(); |
| unsigned numSymbolicOperands = layoutMap.getNumSymbols(); |
| |
| // We don't do any checks for one-to-one'ness; we assume that it is |
| // one-to-one. |
| |
| // Normalize only static memrefs and dynamic memrefs with a tiled-layout map |
| // for now. |
| // TODO: Normalize the other types of dynamic memrefs. |
| SmallVector<std::tuple<AffineExpr, unsigned, unsigned>> tileSizePos; |
| (void)getTileSizePos(layoutMap, tileSizePos); |
| if (memrefType.getNumDynamicDims() > 0 && tileSizePos.empty()) |
| return memrefType; |
| |
| // We have a single map that is not an identity map. Create a new memref |
| // with the right shape and an identity layout map. |
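| // For example (illustrative only): |
| //   memref<64x128xf32, affine_map<(d0, d1) |
| //       -> (d0 floordiv 32, d1 floordiv 64, d0 mod 32, d1 mod 64)>> |
| // normalizes to memref<2x2x32x64xf32> with an identity layout. |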
| ArrayRef<int64_t> shape = memrefType.getShape(); |
| // FlatAffineValueConstraints may later on use symbolic operands. |
| FlatAffineValueConstraints fac(rank, numSymbolicOperands); |
| SmallVector<unsigned, 4> memrefTypeDynDims; |
| for (unsigned d = 0; d < rank; ++d) { |
| // Use the constraint system only for static dimensions. |
| if (shape[d] > 0) { |
| fac.addBound(BoundType::LB, d, 0); |
| fac.addBound(BoundType::UB, d, shape[d] - 1); |
| } else { |
| memrefTypeDynDims.emplace_back(d); |
| } |
| } |
| // We compose this map with the original index (logical) space to derive |
| // the upper bounds for the new index space. |
| unsigned newRank = layoutMap.getNumResults(); |
| if (failed(fac.composeMatchingMap(layoutMap))) |
| return memrefType; |
| // TODO: Handle semi-affine maps. |
| // Project out the old data dimensions. |
| fac.projectOut(newRank, fac.getNumVars() - newRank - fac.getNumLocalVars()); |
| SmallVector<int64_t, 4> newShape(newRank); |
| MLIRContext *context = memrefType.getContext(); |
| for (unsigned d = 0; d < newRank; ++d) { |
| // Check if this dimension is dynamic. |
| if (isNormalizedMemRefDynamicDim(d, layoutMap, memrefTypeDynDims)) { |
| newShape[d] = ShapedType::kDynamic; |
| continue; |
| } |
| // The lower bound for the shape is always zero. |
| std::optional<int64_t> ubConst = fac.getConstantBound64(BoundType::UB, d); |
| // For a static memref and an affine map with no symbols, this is |
| // always bounded. However, when we have symbols, we may not be able to |
| // obtain a constant upper bound. Also, mapping to a negative space is |
| // invalid for normalization. |
| if (!ubConst.has_value() || *ubConst < 0) { |
| LLVM_DEBUG(llvm::dbgs() |
| << "can't normalize map due to unknown/invalid upper bound"); |
| return memrefType; |
| } |
| // This dimension is static: its size is the constant upper bound plus one. |
| newShape[d] = *ubConst + 1; |
| } |
| |
| // Create the new memref type after trivializing the old layout map. |
| auto newMemRefType = |
| MemRefType::Builder(memrefType) |
| .setShape(newShape) |
| .setLayout(AffineMapAttr::get( |
| AffineMap::getMultiDimIdentityMap(newRank, context))); |
| return newMemRefType; |
| } |
| |
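| /// For illustration (hypothetical IR): getDivMod(b, loc, %x, %y) materializes |
| ///   %q = affine.apply affine_map<(d0, d1) -> (d0 floordiv d1)>(%x, %y) |
| ///   %r = affine.apply affine_map<(d0, d1) -> (d0 mod d1)>(%x, %y) |
| /// modulo any folding/composition done by makeComposedAffineApply. |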
| DivModValue mlir::affine::getDivMod(OpBuilder &b, Location loc, Value lhs, |
| Value rhs) { |
| DivModValue result; |
| AffineExpr d0, d1; |
| bindDims(b.getContext(), d0, d1); |
| result.quotient = |
| affine::makeComposedAffineApply(b, loc, d0.floorDiv(d1), {lhs, rhs}); |
| result.remainder = |
| affine::makeComposedAffineApply(b, loc, d0 % d1, {lhs, rhs}); |
| return result; |
| } |
| |
| /// Create an affine map that computes `lhs` * `rhs`, composing in any other |
| /// affine maps. |
| static FailureOr<OpFoldResult> composedAffineMultiply(OpBuilder &b, |
| Location loc, |
| OpFoldResult lhs, |
| OpFoldResult rhs) { |
| AffineExpr s0, s1; |
| bindSymbols(b.getContext(), s0, s1); |
| return makeComposedFoldedAffineApply(b, loc, s0 * s1, {lhs, rhs}); |
| } |
| |
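| /// For illustration (hypothetical values): with basis = (4, 8, 16), |
| /// hasOuterBound = true, and linear index %l, the computed divisors are |
| /// (128, 16) and the results are |
| ///   (%l floordiv 128, (%l mod 128) floordiv 16, (%l mod 128) mod 16). |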
| FailureOr<SmallVector<Value>> |
| mlir::affine::delinearizeIndex(OpBuilder &b, Location loc, Value linearIndex, |
| ArrayRef<Value> basis, bool hasOuterBound) { |
| if (hasOuterBound) |
| basis = basis.drop_front(); |
| |
| // Note: the divisors are backwards due to the scan. |
| SmallVector<Value> divisors; |
| OpFoldResult basisProd = b.getIndexAttr(1); |
| for (OpFoldResult basisElem : llvm::reverse(basis)) { |
| FailureOr<OpFoldResult> nextProd = |
| composedAffineMultiply(b, loc, basisElem, basisProd); |
| if (failed(nextProd)) |
| return failure(); |
| basisProd = *nextProd; |
| divisors.push_back(getValueOrCreateConstantIndexOp(b, loc, basisProd)); |
| } |
| |
| SmallVector<Value> results; |
| results.reserve(divisors.size() + 1); |
| Value residual = linearIndex; |
| for (Value divisor : llvm::reverse(divisors)) { |
| DivModValue divMod = getDivMod(b, loc, residual, divisor); |
| results.push_back(divMod.quotient); |
| residual = divMod.remainder; |
| } |
| results.push_back(residual); |
| return results; |
| } |
| |
| FailureOr<SmallVector<Value>> |
| mlir::affine::delinearizeIndex(OpBuilder &b, Location loc, Value linearIndex, |
| ArrayRef<OpFoldResult> basis, |
| bool hasOuterBound) { |
| if (hasOuterBound) |
| basis = basis.drop_front(); |
| |
| // Note: the divisors are backwards due to the scan. |
| SmallVector<Value> divisors; |
| OpFoldResult basisProd = b.getIndexAttr(1); |
| for (OpFoldResult basisElem : llvm::reverse(basis)) { |
| FailureOr<OpFoldResult> nextProd = |
| composedAffineMultiply(b, loc, basisElem, basisProd); |
| if (failed(nextProd)) |
| return failure(); |
| basisProd = *nextProd; |
| divisors.push_back(getValueOrCreateConstantIndexOp(b, loc, basisProd)); |
| } |
| |
| SmallVector<Value> results; |
| results.reserve(divisors.size() + 1); |
| Value residual = linearIndex; |
| for (Value divisor : llvm::reverse(divisors)) { |
| DivModValue divMod = getDivMod(b, loc, residual, divisor); |
| results.push_back(divMod.quotient); |
| residual = divMod.remainder; |
| } |
| results.push_back(residual); |
| return results; |
| } |
| |
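| /// For illustration (hypothetical values): with multiIndex = (%i, %j, %k) and |
| /// basis = (%b0, %b1, %b2), the strides are (%b1 * %b2, %b2, 1) and the |
| /// folded result computes %i * %b1 * %b2 + %j * %b2 + %k. |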
| OpFoldResult mlir::affine::linearizeIndex(ArrayRef<OpFoldResult> multiIndex, |
| ArrayRef<OpFoldResult> basis, |
| ImplicitLocOpBuilder &builder) { |
| return linearizeIndex(builder, builder.getLoc(), multiIndex, basis); |
| } |
| |
| OpFoldResult mlir::affine::linearizeIndex(OpBuilder &builder, Location loc, |
| ArrayRef<OpFoldResult> multiIndex, |
| ArrayRef<OpFoldResult> basis) { |
| assert(multiIndex.size() == basis.size() || |
| multiIndex.size() == basis.size() + 1); |
| SmallVector<AffineExpr> basisAffine; |
| |
| // Add a fake initial size in order to make the later index linearization |
| // computations line up if an outer bound is not provided. |
| if (multiIndex.size() == basis.size() + 1) |
| basisAffine.push_back(getAffineConstantExpr(1, builder.getContext())); |
| |
| for (size_t i = 0; i < basis.size(); ++i) { |
| basisAffine.push_back(getAffineSymbolExpr(i, builder.getContext())); |
| } |
| |
| SmallVector<AffineExpr> stridesAffine = computeStrides(basisAffine); |
| SmallVector<OpFoldResult> strides; |
| strides.reserve(stridesAffine.size()); |
| llvm::transform(stridesAffine, std::back_inserter(strides), |
| [&builder, &basis, loc](AffineExpr strideExpr) { |
| return affine::makeComposedFoldedAffineApply( |
| builder, loc, strideExpr, basis); |
| }); |
| |
| auto &&[linearIndexExpr, multiIndexAndStrides] = computeLinearIndex( |
| OpFoldResult(builder.getIndexAttr(0)), strides, multiIndex); |
| return affine::makeComposedFoldedAffineApply(builder, loc, linearIndexExpr, |
| multiIndexAndStrides); |
| } |