| //===- SparseVectorization.cpp - Vectorization of sparsified loops --------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
// A pass that converts loops generated by the sparsifier into a form that
// can exploit SIMD instructions of the target architecture. Note that this
// pass ensures the sparsifier can generate efficient SIMD (including ArmSVE
// support) while maintaining a proper separation of concerns between
// sparsification and vectorization. However, this pass is neither the final
// abstraction level we want nor the general vectorizer we want. It forms a
// good stepping stone for incremental future improvements, though.
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "Utils/CodegenUtils.h" |
| #include "Utils/LoopEmitter.h" |
| |
| #include "mlir/Dialect/Affine/IR/AffineOps.h" |
| #include "mlir/Dialect/Arith/IR/Arith.h" |
| #include "mlir/Dialect/Complex/IR/Complex.h" |
| #include "mlir/Dialect/Math/IR/Math.h" |
| #include "mlir/Dialect/MemRef/IR/MemRef.h" |
| #include "mlir/Dialect/SCF/IR/SCF.h" |
| #include "mlir/Dialect/SparseTensor/Transforms/Passes.h" |
| #include "mlir/Dialect/Vector/IR/VectorOps.h" |
| #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" |
| #include "mlir/IR/Matchers.h" |
| |
| using namespace mlir; |
| using namespace mlir::sparse_tensor; |
| |
| namespace { |
| |
| /// Target SIMD properties: |
| /// vectorLength: # packed data elements (viz. vector<16xf32> has length 16) |
/// enableVLAVectorization: enables scalable vectors (viz. ArmSVE)
| /// enableSIMDIndex32: uses 32-bit indices in gather/scatter for efficiency |
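///
/// For example (illustrative only), VL{16, /*VLA=*/true, /*idx32=*/false}
/// selects scalable vector types such as vector<[16]xf32> rather than the
/// fixed-length vector<16xf32>.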
| struct VL { |
| unsigned vectorLength; |
| bool enableVLAVectorization; |
| bool enableSIMDIndex32; |
| }; |
| |
| /// Helper test for invariant value (defined outside given block). |
| static bool isInvariantValue(Value val, Block *block) { |
| return val.getDefiningOp() && val.getDefiningOp()->getBlock() != block; |
| } |
| |
| /// Helper test for invariant argument (defined outside given block). |
| static bool isInvariantArg(BlockArgument arg, Block *block) { |
| return arg.getOwner() != block; |
| } |
| |
| /// Constructs vector type for element type. |
| static VectorType vectorType(VL vl, Type etp) { |
| return VectorType::get(vl.vectorLength, etp, vl.enableVLAVectorization); |
| } |
| |
| /// Constructs vector type from a memref value. |
| static VectorType vectorType(VL vl, Value mem) { |
| return vectorType(vl, getMemRefType(mem).getElementType()); |
| } |
| |
| /// Constructs vector iteration mask. |
| static Value genVectorMask(PatternRewriter &rewriter, Location loc, VL vl, |
| Value iv, Value lo, Value hi, Value step) { |
| VectorType mtp = vectorType(vl, rewriter.getI1Type()); |
| // Special case if the vector length evenly divides the trip count (for |
| // example, "for i = 0, 128, 16"). A constant all-true mask is generated |
| // so that all subsequent masked memory operations are immediately folded |
| // into unconditional memory operations. |
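  // For example (an illustrative sketch, assuming vl = 16), the mask then
  // reduces to:
  //   %true = arith.constant true
  //   %mask = vector.broadcast %true : i1 to vector<16xi1>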
| IntegerAttr loInt, hiInt, stepInt; |
| if (matchPattern(lo, m_Constant(&loInt)) && |
| matchPattern(hi, m_Constant(&hiInt)) && |
| matchPattern(step, m_Constant(&stepInt))) { |
| if (((hiInt.getInt() - loInt.getInt()) % stepInt.getInt()) == 0) { |
| Value trueVal = constantI1(rewriter, loc, true); |
| return rewriter.create<vector::BroadcastOp>(loc, mtp, trueVal); |
| } |
| } |
  // Otherwise, generate a vector mask that avoids overrunning the upper bound
| // during vector execution. Here we rely on subsequent loop optimizations to |
| // avoid executing the mask in all iterations, for example, by splitting the |
| // loop into an unconditional vector loop and a scalar cleanup loop. |
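  // For example (an illustrative sketch, assuming vl = 16), the mask is
  // computed roughly as:
  //   %end  = affine.min affine_map<(d0, d1)[s0] -> (s0, d0 - d1)>
  //             (%hi, %iv)[%step]
  //   %mask = vector.create_mask %end : vector<16xi1>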
| auto min = AffineMap::get( |
| /*dimCount=*/2, /*symbolCount=*/1, |
| {rewriter.getAffineSymbolExpr(0), |
| rewriter.getAffineDimExpr(0) - rewriter.getAffineDimExpr(1)}, |
| rewriter.getContext()); |
| Value end = rewriter.createOrFold<affine::AffineMinOp>( |
| loc, min, ValueRange{hi, iv, step}); |
| return rewriter.create<vector::CreateMaskOp>(loc, mtp, end); |
| } |
| |
| /// Generates a vectorized invariant. Here we rely on subsequent loop |
| /// optimizations to hoist the invariant broadcast out of the vector loop. |
| static Value genVectorInvariantValue(PatternRewriter &rewriter, VL vl, |
| Value val) { |
| VectorType vtp = vectorType(vl, val.getType()); |
| return rewriter.create<vector::BroadcastOp>(val.getLoc(), vtp, val); |
| } |
| |
| /// Generates a vectorized load lhs = a[ind[lo:hi]] or lhs = a[lo:hi], |
| /// where 'lo' denotes the current index and 'hi = lo + vl - 1'. Note |
| /// that the sparsifier can only generate indirect loads in |
| /// the last index, i.e. back(). |
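///
/// For example (an illustrative sketch, assuming vl = 16 and f32 elements),
/// the dense form lowers roughly to
///   %v = vector.maskedload %a[%i], %vmask, %pass : ... into vector<16xf32>
/// and the indirect form to
///   %v = vector.gather %a[%c0] [%ind], %vmask, %pass : ... into vector<16xf32>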
| static Value genVectorLoad(PatternRewriter &rewriter, Location loc, VL vl, |
| Value mem, ArrayRef<Value> idxs, Value vmask) { |
| VectorType vtp = vectorType(vl, mem); |
| Value pass = constantZero(rewriter, loc, vtp); |
| if (llvm::isa<VectorType>(idxs.back().getType())) { |
| SmallVector<Value> scalarArgs(idxs); |
| Value indexVec = idxs.back(); |
| scalarArgs.back() = constantIndex(rewriter, loc, 0); |
| return rewriter.create<vector::GatherOp>(loc, vtp, mem, scalarArgs, |
| indexVec, vmask, pass); |
| } |
| return rewriter.create<vector::MaskedLoadOp>(loc, vtp, mem, idxs, vmask, |
| pass); |
| } |
| |
| /// Generates a vectorized store a[ind[lo:hi]] = rhs or a[lo:hi] = rhs |
| /// where 'lo' denotes the current index and 'hi = lo + vl - 1'. Note |
| /// that the sparsifier can only generate indirect stores in |
| /// the last index, i.e. back(). |
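///
/// For example (an illustrative sketch), the dense form lowers roughly to
///   vector.maskedstore %a[%i], %vmask, %rhs
/// and the indirect form to
///   vector.scatter %a[%c0] [%ind], %vmask, %rhs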
| static void genVectorStore(PatternRewriter &rewriter, Location loc, Value mem, |
| ArrayRef<Value> idxs, Value vmask, Value rhs) { |
| if (llvm::isa<VectorType>(idxs.back().getType())) { |
| SmallVector<Value> scalarArgs(idxs); |
| Value indexVec = idxs.back(); |
| scalarArgs.back() = constantIndex(rewriter, loc, 0); |
| rewriter.create<vector::ScatterOp>(loc, mem, scalarArgs, indexVec, vmask, |
| rhs); |
| return; |
| } |
| rewriter.create<vector::MaskedStoreOp>(loc, mem, idxs, vmask, rhs); |
| } |
| |
/// Detects a vectorizable reduction operation and, on success, returns the
/// combining kind of the reduction in `kind`.
| static bool isVectorizableReduction(Value red, Value iter, |
| vector::CombiningKind &kind) { |
| if (auto addf = red.getDefiningOp<arith::AddFOp>()) { |
| kind = vector::CombiningKind::ADD; |
| return addf->getOperand(0) == iter || addf->getOperand(1) == iter; |
| } |
| if (auto addi = red.getDefiningOp<arith::AddIOp>()) { |
| kind = vector::CombiningKind::ADD; |
| return addi->getOperand(0) == iter || addi->getOperand(1) == iter; |
| } |
| if (auto subf = red.getDefiningOp<arith::SubFOp>()) { |
| kind = vector::CombiningKind::ADD; |
| return subf->getOperand(0) == iter; |
| } |
| if (auto subi = red.getDefiningOp<arith::SubIOp>()) { |
| kind = vector::CombiningKind::ADD; |
| return subi->getOperand(0) == iter; |
| } |
| if (auto mulf = red.getDefiningOp<arith::MulFOp>()) { |
| kind = vector::CombiningKind::MUL; |
| return mulf->getOperand(0) == iter || mulf->getOperand(1) == iter; |
| } |
| if (auto muli = red.getDefiningOp<arith::MulIOp>()) { |
| kind = vector::CombiningKind::MUL; |
| return muli->getOperand(0) == iter || muli->getOperand(1) == iter; |
| } |
| if (auto andi = red.getDefiningOp<arith::AndIOp>()) { |
| kind = vector::CombiningKind::AND; |
| return andi->getOperand(0) == iter || andi->getOperand(1) == iter; |
| } |
| if (auto ori = red.getDefiningOp<arith::OrIOp>()) { |
| kind = vector::CombiningKind::OR; |
| return ori->getOperand(0) == iter || ori->getOperand(1) == iter; |
| } |
| if (auto xori = red.getDefiningOp<arith::XOrIOp>()) { |
| kind = vector::CombiningKind::XOR; |
| return xori->getOperand(0) == iter || xori->getOperand(1) == iter; |
| } |
| return false; |
| } |
| |
| /// Generates an initial value for a vector reduction, following the scheme |
| /// given in Chapter 5 of "The Software Vectorization Handbook", where the |
| /// initial scalar value is correctly embedded in the vector reduction value, |
| /// and a straightforward horizontal reduction will complete the operation. |
| /// Value 'r' denotes the initial value of the reduction outside the loop. |
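///
/// For example (an illustrative sketch, assuming vl = 4 and an ADD reduction
/// with initial scalar value %r), the reduction vector starts out roughly as
///   %z     = arith.constant dense<0.0> : vector<4xf32>
///   %vinit = vector.insert %r, %z[0] : f32 into vector<4xf32>
/// so that a final horizontal reduction over the partials recovers the result.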
| static Value genVectorReducInit(PatternRewriter &rewriter, Location loc, |
| Value red, Value iter, Value r, |
| VectorType vtp) { |
| vector::CombiningKind kind; |
| if (!isVectorizableReduction(red, iter, kind)) |
| llvm_unreachable("unknown reduction"); |
| switch (kind) { |
| case vector::CombiningKind::ADD: |
| case vector::CombiningKind::XOR: |
| // Initialize reduction vector to: | 0 | .. | 0 | r | |
| return rewriter.create<vector::InsertOp>(loc, r, |
| constantZero(rewriter, loc, vtp), |
| constantIndex(rewriter, loc, 0)); |
| case vector::CombiningKind::MUL: |
| // Initialize reduction vector to: | 1 | .. | 1 | r | |
| return rewriter.create<vector::InsertOp>(loc, r, |
| constantOne(rewriter, loc, vtp), |
| constantIndex(rewriter, loc, 0)); |
| case vector::CombiningKind::AND: |
| case vector::CombiningKind::OR: |
| // Initialize reduction vector to: | r | .. | r | r | |
| return rewriter.create<vector::BroadcastOp>(loc, vtp, r); |
| default: |
| break; |
| } |
| llvm_unreachable("unknown reduction kind"); |
| } |
| |
| /// This method is called twice to analyze and rewrite the given subscripts. |
| /// The first call (!codegen) does the analysis. Then, on success, the second |
| /// call (codegen) yields the proper vector form in the output parameter |
| /// vector 'idxs'. This mechanism ensures that analysis and rewriting code |
| /// stay in sync. Note that the analyis part is simple because the sparsifier |
| /// only generates relatively simple subscript expressions. |
| /// |
| /// See https://llvm.org/docs/GetElementPtr.html for some background on |
| /// the complications described below. |
| /// |
| /// We need to generate a position/coordinate load from the sparse storage |
| /// scheme. Narrower data types need to be zero extended before casting |
| /// the value into the `index` type used for looping and indexing. |
| /// |
/// For the scalar case, subscripts simply zero extend narrower indices into
/// 64-bit values before casting to the index type, without incurring a
/// performance penalty. Indices that already are 64-bit, in theory, cannot
/// express the full range, since the LLVM backend defines addressing in terms
/// of an unsigned pointer/signed index pair.
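///
/// For example (an illustrative sketch, assuming 32-bit coordinates and
/// enableSIMDIndex32), the indirect subscript a[ind[i]] yields roughly
///   %idx = vector.maskedload %ind[%i], %vmask, %zero : ... into vector<16xi32>
/// which is then used directly as the index vector of a subsequent
/// gather/scatter.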
| static bool vectorizeSubscripts(PatternRewriter &rewriter, scf::ForOp forOp, |
| VL vl, ValueRange subs, bool codegen, |
| Value vmask, SmallVectorImpl<Value> &idxs) { |
| unsigned d = 0; |
| unsigned dim = subs.size(); |
| Block *block = &forOp.getRegion().front(); |
| for (auto sub : subs) { |
| bool innermost = ++d == dim; |
| // Invariant subscripts in outer dimensions simply pass through. |
| // Note that we rely on LICM to hoist loads where all subscripts |
| // are invariant in the innermost loop. |
| // Example: |
| // a[inv][i] for inv |
| if (isInvariantValue(sub, block)) { |
| if (innermost) |
| return false; |
| if (codegen) |
| idxs.push_back(sub); |
| continue; // success so far |
| } |
| // Invariant block arguments (including outer loop indices) in outer |
| // dimensions simply pass through. Direct loop indices in the |
| // innermost loop simply pass through as well. |
| // Example: |
| // a[i][j] for both i and j |
| if (auto arg = llvm::dyn_cast<BlockArgument>(sub)) { |
| if (isInvariantArg(arg, block) == innermost) |
| return false; |
| if (codegen) |
| idxs.push_back(sub); |
| continue; // success so far |
| } |
| // Look under the hood of casting. |
| auto cast = sub; |
| while (true) { |
| if (auto icast = cast.getDefiningOp<arith::IndexCastOp>()) |
| cast = icast->getOperand(0); |
| else if (auto ecast = cast.getDefiningOp<arith::ExtUIOp>()) |
| cast = ecast->getOperand(0); |
| else |
| break; |
| } |
    // Since the index vector is used in subsequent gather/scatter operations,
    // which effectively define an unsigned pointer + signed index, we must
    // zero extend the vector to an index width. For 8-bit and 16-bit values,
    // a 32-bit index width suffices. For 32-bit values,
| // zero extending the elements into 64-bit loses some performance since |
| // the 32-bit indexed gather/scatter is more efficient than the 64-bit |
| // index variant (if the negative 32-bit index space is unused, the |
| // enableSIMDIndex32 flag can preserve this performance). For 64-bit |
| // values, there is no good way to state that the indices are unsigned, |
| // which creates the potential of incorrect address calculations in the |
| // unlikely case we need such extremely large offsets. |
| // Example: |
| // a[ ind[i] ] |
| if (auto load = cast.getDefiningOp<memref::LoadOp>()) { |
| if (!innermost) |
| return false; |
| if (codegen) { |
| SmallVector<Value> idxs2(load.getIndices()); // no need to analyze |
| Location loc = forOp.getLoc(); |
| Value vload = |
| genVectorLoad(rewriter, loc, vl, load.getMemRef(), idxs2, vmask); |
| Type etp = llvm::cast<VectorType>(vload.getType()).getElementType(); |
| if (!llvm::isa<IndexType>(etp)) { |
| if (etp.getIntOrFloatBitWidth() < 32) |
| vload = rewriter.create<arith::ExtUIOp>( |
| loc, vectorType(vl, rewriter.getI32Type()), vload); |
| else if (etp.getIntOrFloatBitWidth() < 64 && !vl.enableSIMDIndex32) |
| vload = rewriter.create<arith::ExtUIOp>( |
| loc, vectorType(vl, rewriter.getI64Type()), vload); |
| } |
| idxs.push_back(vload); |
| } |
| continue; // success so far |
| } |
| // Address calculation 'i = add inv, idx' (after LICM). |
| // Example: |
| // a[base + i] |
| if (auto load = cast.getDefiningOp<arith::AddIOp>()) { |
| Value inv = load.getOperand(0); |
| Value idx = load.getOperand(1); |
| // Swap non-invariant. |
| if (!isInvariantValue(inv, block)) { |
| inv = idx; |
| idx = load.getOperand(0); |
| } |
| // Inspect. |
| if (isInvariantValue(inv, block)) { |
| if (auto arg = llvm::dyn_cast<BlockArgument>(idx)) { |
| if (isInvariantArg(arg, block) || !innermost) |
| return false; |
| if (codegen) |
| idxs.push_back( |
| rewriter.create<arith::AddIOp>(forOp.getLoc(), inv, idx)); |
| continue; // success so far |
| } |
| } |
| } |
| return false; |
| } |
| return true; |
| } |
| |
| #define UNAOP(xxx) \ |
| if (isa<xxx>(def)) { \ |
| if (codegen) \ |
| vexp = rewriter.create<xxx>(loc, vx); \ |
| return true; \ |
| } |
| |
| #define TYPEDUNAOP(xxx) \ |
| if (auto x = dyn_cast<xxx>(def)) { \ |
| if (codegen) { \ |
| VectorType vtp = vectorType(vl, x.getType()); \ |
| vexp = rewriter.create<xxx>(loc, vtp, vx); \ |
| } \ |
| return true; \ |
| } |
| |
| #define BINOP(xxx) \ |
| if (isa<xxx>(def)) { \ |
| if (codegen) \ |
| vexp = rewriter.create<xxx>(loc, vx, vy); \ |
| return true; \ |
| } |
| |
| /// This method is called twice to analyze and rewrite the given expression. |
| /// The first call (!codegen) does the analysis. Then, on success, the second |
| /// call (codegen) yields the proper vector form in the output parameter 'vexp'. |
| /// This mechanism ensures that analysis and rewriting code stay in sync. Note |
/// that the analysis part is simple because the sparsifier only generates
| /// relatively simple expressions inside the for-loops. |
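///
/// For example (an illustrative sketch, assuming vl = 16 and f32 elements),
/// the expression b[i] + c[i] inside the loop-body becomes roughly
///   %vb = vector.maskedload %b[%i], %vmask, %pass : ... into vector<16xf32>
///   %vc = vector.maskedload %c[%i], %vmask, %pass : ... into vector<16xf32>
///   %ve = arith.addf %vb, %vc : vector<16xf32>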
| static bool vectorizeExpr(PatternRewriter &rewriter, scf::ForOp forOp, VL vl, |
| Value exp, bool codegen, Value vmask, Value &vexp) { |
| Location loc = forOp.getLoc(); |
| // Reject unsupported types. |
| if (!VectorType::isValidElementType(exp.getType())) |
| return false; |
| // A block argument is invariant/reduction/index. |
| if (auto arg = llvm::dyn_cast<BlockArgument>(exp)) { |
| if (arg == forOp.getInductionVar()) { |
| // We encountered a single, innermost index inside the computation, |
| // such as a[i] = i, which must convert to [i, i+1, ...]. |
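      // For example (an illustrative sketch with vl = 4 at iteration i):
      //   broadcast(i) + step()  ==>  [i, i, i, i] + [0, 1, 2, 3]
      //                          ==>  [i, i+1, i+2, i+3]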
| if (codegen) { |
| VectorType vtp = vectorType(vl, arg.getType()); |
| Value veci = rewriter.create<vector::BroadcastOp>(loc, vtp, arg); |
| Value incr = rewriter.create<vector::StepOp>(loc, vtp); |
| vexp = rewriter.create<arith::AddIOp>(loc, veci, incr); |
| } |
| return true; |
| } |
| // An invariant or reduction. In both cases, we treat this as an |
| // invariant value, and rely on later replacing and folding to |
| // construct a proper reduction chain for the latter case. |
| if (codegen) |
| vexp = genVectorInvariantValue(rewriter, vl, exp); |
| return true; |
| } |
| // Something defined outside the loop-body is invariant. |
| Operation *def = exp.getDefiningOp(); |
| Block *block = &forOp.getRegion().front(); |
| if (def->getBlock() != block) { |
| if (codegen) |
| vexp = genVectorInvariantValue(rewriter, vl, exp); |
| return true; |
| } |
| // Proper load operations. These are either values involved in the |
| // actual computation, such as a[i] = b[i] becomes a[lo:hi] = b[lo:hi], |
| // or coordinate values inside the computation that are now fetched from |
| // the sparse storage coordinates arrays, such as a[i] = i becomes |
| // a[lo:hi] = ind[lo:hi], where 'lo' denotes the current index |
| // and 'hi = lo + vl - 1'. |
| if (auto load = dyn_cast<memref::LoadOp>(def)) { |
| auto subs = load.getIndices(); |
| SmallVector<Value> idxs; |
| if (vectorizeSubscripts(rewriter, forOp, vl, subs, codegen, vmask, idxs)) { |
| if (codegen) |
| vexp = genVectorLoad(rewriter, loc, vl, load.getMemRef(), idxs, vmask); |
| return true; |
| } |
| return false; |
| } |
| // Inside loop-body unary and binary operations. Note that it would be |
| // nicer if we could somehow test and build the operations in a more |
| // concise manner than just listing them all (although this way we know |
| // for certain that they can vectorize). |
| // |
| // TODO: avoid visiting CSEs multiple times |
| // |
| if (def->getNumOperands() == 1) { |
| Value vx; |
| if (vectorizeExpr(rewriter, forOp, vl, def->getOperand(0), codegen, vmask, |
| vx)) { |
| UNAOP(math::AbsFOp) |
| UNAOP(math::AbsIOp) |
| UNAOP(math::CeilOp) |
| UNAOP(math::FloorOp) |
| UNAOP(math::SqrtOp) |
| UNAOP(math::ExpM1Op) |
| UNAOP(math::Log1pOp) |
| UNAOP(math::SinOp) |
| UNAOP(math::TanhOp) |
| UNAOP(arith::NegFOp) |
| TYPEDUNAOP(arith::TruncFOp) |
| TYPEDUNAOP(arith::ExtFOp) |
| TYPEDUNAOP(arith::FPToSIOp) |
| TYPEDUNAOP(arith::FPToUIOp) |
| TYPEDUNAOP(arith::SIToFPOp) |
| TYPEDUNAOP(arith::UIToFPOp) |
| TYPEDUNAOP(arith::ExtSIOp) |
| TYPEDUNAOP(arith::ExtUIOp) |
| TYPEDUNAOP(arith::IndexCastOp) |
| TYPEDUNAOP(arith::TruncIOp) |
| TYPEDUNAOP(arith::BitcastOp) |
| // TODO: complex? |
| } |
| } else if (def->getNumOperands() == 2) { |
| Value vx, vy; |
| if (vectorizeExpr(rewriter, forOp, vl, def->getOperand(0), codegen, vmask, |
| vx) && |
| vectorizeExpr(rewriter, forOp, vl, def->getOperand(1), codegen, vmask, |
| vy)) { |
      // We only accept shift-by-invariant (where the same shift factor applies
      // to all packed elements). In the vector dialect, this is still
      // represented with an expanded vector on the right-hand side, however,
      // so that we do not have to special-case the code generation.
| if (isa<arith::ShLIOp>(def) || isa<arith::ShRUIOp>(def) || |
| isa<arith::ShRSIOp>(def)) { |
| Value shiftFactor = def->getOperand(1); |
| if (!isInvariantValue(shiftFactor, block)) |
| return false; |
| } |
| // Generate code. |
| BINOP(arith::MulFOp) |
| BINOP(arith::MulIOp) |
| BINOP(arith::DivFOp) |
| BINOP(arith::DivSIOp) |
| BINOP(arith::DivUIOp) |
| BINOP(arith::AddFOp) |
| BINOP(arith::AddIOp) |
| BINOP(arith::SubFOp) |
| BINOP(arith::SubIOp) |
| BINOP(arith::AndIOp) |
| BINOP(arith::OrIOp) |
| BINOP(arith::XOrIOp) |
| BINOP(arith::ShLIOp) |
| BINOP(arith::ShRUIOp) |
| BINOP(arith::ShRSIOp) |
| // TODO: complex? |
| } |
| } |
| return false; |
| } |
| |
| #undef UNAOP |
| #undef TYPEDUNAOP |
| #undef BINOP |
| |
| /// This method is called twice to analyze and rewrite the given for-loop. |
| /// The first call (!codegen) does the analysis. Then, on success, the second |
/// call (codegen) rewrites the IR into vector form. This mechanism ensures
| /// that analysis and rewriting code stay in sync. |
| static bool vectorizeStmt(PatternRewriter &rewriter, scf::ForOp forOp, VL vl, |
| bool codegen) { |
| Block &block = forOp.getRegion().front(); |
  // For-loops with a single yield statement (as below) can be generated
  // when a custom reduction is used with a unary operation.
| // for (...) |
| // yield c_0 |
| if (block.getOperations().size() <= 1) |
| return false; |
| |
| Location loc = forOp.getLoc(); |
| scf::YieldOp yield = cast<scf::YieldOp>(block.getTerminator()); |
| auto &last = *++block.rbegin(); |
| scf::ForOp forOpNew; |
| |
  // Perform initial setup during codegen (we know that the first analysis
| // pass was successful). For reductions, we need to construct a completely |
| // new for-loop, since the incoming and outgoing reduction type |
| // changes into SIMD form. For stores, we can simply adjust the stride |
| // and insert in the existing for-loop. In both cases, we set up a vector |
| // mask for all operations which takes care of confining vectors to |
| // the original iteration space (later cleanup loops or other |
| // optimizations can take care of those). |
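  // For example (an illustrative sketch, assuming vl = 8 and an f32 sum),
  //   %s = scf.for %i = %lo to %hi step %c1 iter_args(%r = %init) -> (f32)
  // becomes roughly
  //   %v = scf.for %i = %lo to %hi step %c8
  //          iter_args(%vr = %vinit) -> (vector<8xf32>)
  //   %s = vector.reduction <add>, %v : vector<8xf32> into f32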
| Value vmask; |
| if (codegen) { |
| Value step = constantIndex(rewriter, loc, vl.vectorLength); |
| if (vl.enableVLAVectorization) { |
| Value vscale = |
| rewriter.create<vector::VectorScaleOp>(loc, rewriter.getIndexType()); |
| step = rewriter.create<arith::MulIOp>(loc, vscale, step); |
| } |
| if (!yield.getResults().empty()) { |
| Value init = forOp.getInitArgs()[0]; |
| VectorType vtp = vectorType(vl, init.getType()); |
| Value vinit = genVectorReducInit(rewriter, loc, yield->getOperand(0), |
| forOp.getRegionIterArg(0), init, vtp); |
| forOpNew = rewriter.create<scf::ForOp>( |
| loc, forOp.getLowerBound(), forOp.getUpperBound(), step, vinit); |
| forOpNew->setAttr( |
| LoopEmitter::getLoopEmitterLoopAttrName(), |
| forOp->getAttr(LoopEmitter::getLoopEmitterLoopAttrName())); |
| rewriter.setInsertionPointToStart(forOpNew.getBody()); |
| } else { |
| rewriter.modifyOpInPlace(forOp, [&]() { forOp.setStep(step); }); |
| rewriter.setInsertionPoint(yield); |
| } |
| vmask = genVectorMask(rewriter, loc, vl, forOp.getInductionVar(), |
| forOp.getLowerBound(), forOp.getUpperBound(), step); |
| } |
| |
  // Sparse for-loops are terminated either by a non-empty yield operation
  // (reduction loop) or by a store operation (parallel loop).
| if (!yield.getResults().empty()) { |
| // Analyze/vectorize reduction. |
| if (yield->getNumOperands() != 1) |
| return false; |
| Value red = yield->getOperand(0); |
| Value iter = forOp.getRegionIterArg(0); |
| vector::CombiningKind kind; |
| Value vrhs; |
| if (isVectorizableReduction(red, iter, kind) && |
| vectorizeExpr(rewriter, forOp, vl, red, codegen, vmask, vrhs)) { |
| if (codegen) { |
| Value partial = forOpNew.getResult(0); |
| Value vpass = genVectorInvariantValue(rewriter, vl, iter); |
| Value vred = rewriter.create<arith::SelectOp>(loc, vmask, vrhs, vpass); |
| rewriter.create<scf::YieldOp>(loc, vred); |
| rewriter.setInsertionPointAfter(forOpNew); |
| Value vres = rewriter.create<vector::ReductionOp>(loc, kind, partial); |
| // Now do some relinking (last one is not completely type safe |
| // but all bad ones are removed right away). This also folds away |
| // nop broadcast operations. |
| rewriter.replaceAllUsesWith(forOp.getResult(0), vres); |
| rewriter.replaceAllUsesWith(forOp.getInductionVar(), |
| forOpNew.getInductionVar()); |
| rewriter.replaceAllUsesWith(forOp.getRegionIterArg(0), |
| forOpNew.getRegionIterArg(0)); |
| rewriter.eraseOp(forOp); |
| } |
| return true; |
| } |
| } else if (auto store = dyn_cast<memref::StoreOp>(last)) { |
| // Analyze/vectorize store operation. |
| auto subs = store.getIndices(); |
| SmallVector<Value> idxs; |
| Value rhs = store.getValue(); |
| Value vrhs; |
| if (vectorizeSubscripts(rewriter, forOp, vl, subs, codegen, vmask, idxs) && |
| vectorizeExpr(rewriter, forOp, vl, rhs, codegen, vmask, vrhs)) { |
| if (codegen) { |
| genVectorStore(rewriter, loc, store.getMemRef(), idxs, vmask, vrhs); |
| rewriter.eraseOp(store); |
| } |
| return true; |
| } |
| } |
| |
| assert(!codegen && "cannot call codegen when analysis failed"); |
| return false; |
| } |
| |
| /// Basic for-loop vectorizer. |
| struct ForOpRewriter : public OpRewritePattern<scf::ForOp> { |
| public: |
| using OpRewritePattern<scf::ForOp>::OpRewritePattern; |
| |
| ForOpRewriter(MLIRContext *context, unsigned vectorLength, |
| bool enableVLAVectorization, bool enableSIMDIndex32) |
| : OpRewritePattern(context), vl{vectorLength, enableVLAVectorization, |
| enableSIMDIndex32} {} |
| |
| LogicalResult matchAndRewrite(scf::ForOp op, |
| PatternRewriter &rewriter) const override { |
    // Check for a single-block, unit-stride for-loop generated by the
    // sparsifier, which means no data dependence analysis is required, and
    // its loop-body is very restricted in form.
| if (!op.getRegion().hasOneBlock() || !isOneInteger(op.getStep()) || |
| !op->hasAttr(LoopEmitter::getLoopEmitterLoopAttrName())) |
| return failure(); |
| // Analyze (!codegen) and rewrite (codegen) loop-body. |
| if (vectorizeStmt(rewriter, op, vl, /*codegen=*/false) && |
| vectorizeStmt(rewriter, op, vl, /*codegen=*/true)) |
| return success(); |
| return failure(); |
| } |
| |
| private: |
| const VL vl; |
| }; |
| |
| static LogicalResult cleanReducChain(PatternRewriter &rewriter, Operation *op, |
| Value inp) { |
| if (auto redOp = inp.getDefiningOp<vector::ReductionOp>()) { |
| if (auto forOp = redOp.getVector().getDefiningOp<scf::ForOp>()) { |
| if (forOp->hasAttr(LoopEmitter::getLoopEmitterLoopAttrName())) { |
| rewriter.replaceOp(op, redOp.getVector()); |
| return success(); |
| } |
| } |
| } |
| return failure(); |
| } |
| |
| /// Reduction chain cleanup. |
| /// v = for { } |
| /// s = vsum(v) v = for { } |
| /// u = broadcast(s) -> for (v) { } |
| /// for (u) { } |
| struct ReducChainBroadcastRewriter |
| : public OpRewritePattern<vector::BroadcastOp> { |
| public: |
| using OpRewritePattern<vector::BroadcastOp>::OpRewritePattern; |
| |
| LogicalResult matchAndRewrite(vector::BroadcastOp op, |
| PatternRewriter &rewriter) const override { |
| return cleanReducChain(rewriter, op, op.getSource()); |
| } |
| }; |
| |
| /// Reduction chain cleanup. |
| /// v = for { } |
| /// s = vsum(v) v = for { } |
| /// u = insert(s) -> for (v) { } |
| /// for (u) { } |
| struct ReducChainInsertRewriter : public OpRewritePattern<vector::InsertOp> { |
| public: |
| using OpRewritePattern<vector::InsertOp>::OpRewritePattern; |
| |
| LogicalResult matchAndRewrite(vector::InsertOp op, |
| PatternRewriter &rewriter) const override { |
| return cleanReducChain(rewriter, op, op.getValueToStore()); |
| } |
| }; |
| } // namespace |
| |
| //===----------------------------------------------------------------------===// |
| // Public method for populating vectorization rules. |
| //===----------------------------------------------------------------------===// |
| |
| /// Populates the given patterns list with vectorization rules. |
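///
/// Typical usage (an illustrative sketch; the actual invocation lives in the
/// sparsifier pipeline):
///   RewritePatternSet patterns(ctx);
///   populateSparseVectorizationPatterns(patterns, /*vectorLength=*/16,
///                                       /*enableVLAVectorization=*/false,
///                                       /*enableSIMDIndex32=*/false);
///   (void)applyPatternsGreedily(funcOp, std::move(patterns));
///   // (older MLIR versions name this applyPatternsAndFoldGreedily)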
| void mlir::populateSparseVectorizationPatterns(RewritePatternSet &patterns, |
| unsigned vectorLength, |
| bool enableVLAVectorization, |
| bool enableSIMDIndex32) { |
| assert(vectorLength > 0); |
| vector::populateVectorStepLoweringPatterns(patterns); |
| patterns.add<ForOpRewriter>(patterns.getContext(), vectorLength, |
| enableVLAVectorization, enableSIMDIndex32); |
| patterns.add<ReducChainInsertRewriter, ReducChainBroadcastRewriter>( |
| patterns.getContext()); |
| } |