| //===- LoopEmitter.cpp ----------------------------------------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "LoopEmitter.h" |
| #include "CodegenUtils.h" |
| |
| #include "mlir/Dialect/Arith/IR/Arith.h" |
| #include "mlir/Dialect/Bufferization/IR/Bufferization.h" |
| #include "mlir/Dialect/Linalg/IR/Linalg.h" |
| #include "mlir/Dialect/Linalg/Utils/Utils.h" |
| #include "mlir/Dialect/MemRef/IR/MemRef.h" |
| #include "mlir/Dialect/SCF/IR/SCF.h" |
| #include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h" |
| #include "mlir/Dialect/Tensor/IR/Tensor.h" |
| #include "mlir/Dialect/Vector/IR/VectorOps.h" |
| |
| using namespace mlir; |
| using namespace mlir::sparse_tensor; |
| |
| //===----------------------------------------------------------------------===// |
| // File local shorthand macros |
| //===----------------------------------------------------------------------===// |
| |
| #define CMPI(p, l, r) \ |
| (builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::p, (l), (r)) \ |
| .getResult()) |
| |
| #define C_IDX(v) (constantIndex(builder, loc, (v))) |
| #define YIELD(vs) (builder.create<scf::YieldOp>(loc, (vs))) |
| #define ADDI(lhs, rhs) (builder.create<arith::AddIOp>(loc, (lhs), (rhs))) |
| #define ANDI(lhs, rhs) (builder.create<arith::AndIOp>(loc, (lhs), (rhs))) |
| #define SUBI(lhs, rhs) (builder.create<arith::SubIOp>(loc, (lhs), (rhs))) |
| #define MULI(lhs, rhs) (builder.create<arith::MulIOp>(loc, (lhs), (rhs))) |
| #define REMUI(lhs, rhs) (builder.create<arith::RemUIOp>(loc, (lhs), (rhs))) |
| #define DIVUI(lhs, rhs) (builder.create<arith::DivUIOp>(loc, (lhs), (rhs))) |
| #define SELECT(c, l, r) (builder.create<arith::SelectOp>(loc, (c), (l), (r))) |
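| |
| // As an illustration, a bound check written as CMPI(ult, lo, hi) expands to |
| // an `arith.cmpi` op with the `ult` predicate created at `loc` using the |
| // in-scope `builder`, yielding its i1 result; the remaining shorthands |
| // follow the same pattern for their respective arith/scf ops. |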
| |
| //===----------------------------------------------------------------------===// |
| // Debugging utils |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef NDEBUG |
| LLVM_ATTRIBUTE_UNUSED static void dumpIndexMemRef(OpBuilder &builder, |
| Location loc, Value memref) { |
| memref = builder.create<memref::CastOp>( |
| loc, UnrankedMemRefType::get(builder.getIndexType(), 0), memref); |
| createFuncCall(builder, loc, "printMemrefInd", TypeRange{}, |
| ValueRange{memref}, EmitCInterface::On); |
| } |
| #endif |
| |
| //===----------------------------------------------------------------------===// |
| // File local helper functions. |
| //===----------------------------------------------------------------------===// |
| |
| // For index reduction loops, since the tensors are sliced into non-contiguous |
| // fragments, we need a triple [pLo, pHi, pPtr], in which the pair (pLo, pHi) |
| // specifies the range of the fragment, and pPtr specifies the index of the |
| // corresponding fragment in the child level (i.e., a pointer to the sliced |
| // position array). |
| static Value genSliceOffset(OpBuilder &builder, Location loc, Value tensor, |
| Level lvl) { |
| auto enc = getSparseTensorEncoding(tensor.getType()); |
| return createOrFoldSliceOffsetOp(builder, loc, tensor, toDim(enc, lvl)); |
| } |
| |
| static Value genSliceStride(OpBuilder &builder, Location loc, Value tensor, |
| Level lvl) { |
| auto enc = getSparseTensorEncoding(tensor.getType()); |
| return createOrFoldSliceStrideOp(builder, loc, tensor, toDim(enc, lvl)); |
| } |
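| |
| // Note: for a sliced sparse tensor these helpers produce the offset/stride |
| // of the slice along the dimension that level `lvl` maps to; depending on |
| // the encoding they either fold to index constants or materialize the |
| // corresponding query ops (see createOrFoldSliceOffsetOp and |
| // createOrFoldSliceStrideOp). |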
| |
| static bool isIntOrFPZero(Attribute attr) { |
| if (auto f = llvm::dyn_cast<FloatAttr>(attr); f && f.getValue().isZero()) |
| return true; |
| if (auto i = llvm::dyn_cast<IntegerAttr>(attr); i && i.getValue().isZero()) |
| return true; |
| return false; |
| } |
| |
| static Value unFoldOpIntResult(OpBuilder &builder, Location loc, |
| OpFoldResult ofr) { |
| if (std::optional<int64_t> i = getConstantIntValue(ofr); i.has_value()) |
| return constantIndex(builder, loc, *i); |
| return ofr.get<Value>(); |
| } |
| |
| static Value tryFoldTensors(Value t) { |
| // TODO: this should be done through a folding pass after switching to |
| // `sparse_tensor.iterate`-based sparsification. |
| auto stt = tryGetSparseTensorType(t); |
| auto padOp = t.getDefiningOp<tensor::PadOp>(); |
| if (padOp && stt.has_value() && stt->hasEncoding() && |
| padOp.getSourceType().getEncoding() == stt->getEncoding() && |
| stt->getEncoding().isIdentity()) { |
| // Try fusing padOp with zeros. |
| Attribute padCst; |
| if (matchPattern(padOp.getBody()->getTerminator(), |
| m_Op<tensor::YieldOp>(m_Constant(&padCst))) && |
| isIntOrFPZero(padCst)) { |
| return padOp.getSource(); |
| } |
| } |
| return t; |
| } |
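| |
| // As an illustration of the fold above: a `tensor.pad` whose body yields a |
| // constant zero and whose source shares the same identity-mapped sparse |
| // encoding is bypassed entirely; the dropped padding is reintroduced later |
| // by wrapping the corresponding level iterator in a padded iterator (see |
| // makeLevelIterator below). |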
| |
| //===----------------------------------------------------------------------===// |
| // Sparse tensor loop emitter class implementations |
| //===----------------------------------------------------------------------===// |
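| |
| // A rough sketch of how clients drive this class (simplified; see |
| // LoopEmitter.h for the full contract): |
| // |
| //   LoopEmitter emitter(tensors, ...); |
| //   emitter.initializeLoopEmit(builder, loc, ...); |
| //   for each loop, in loop order: |
| //     emitter.enterNewLoopSeq(builder, loc, tidLvls); |
| //     emitter.enterCoIterationOverTensorsAtLvls(builder, loc, tidLvls, reduc); |
| //     ... emit the loop body ... |
| //     emitter.exitCurrentLoop(rewriter, loc, reduc); |
| //     emitter.exitCurrentLoopSeq(builder, loc); |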
| |
| LoopEmitter::LoopEmitter(ValueRange tensors, StringAttr loopTag, bool hasOutput, |
| bool isSparseOut, unsigned numLoops, |
| DependentLvlGetter dimGetter, |
| SparseEmitStrategy emitStrategy) { |
| initialize(tensors, loopTag, hasOutput, isSparseOut, numLoops, dimGetter, |
| emitStrategy); |
| } |
| |
| void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput, |
| bool isSparseOut, unsigned numLoops, |
| DependentLvlGetter dimGetter, |
| SparseEmitStrategy emitStrategy) { |
| // First initialize the top-level type of the fields. |
| this->loopTag = loopTag; |
| this->hasOutput = hasOutput; |
| this->isSparseOut = isSparseOut; |
| this->emitStrategy = emitStrategy; |
| |
| const unsigned numManifestTensors = ts.size(); |
| const unsigned synTensorId = numManifestTensors; |
| const unsigned numTensors = numManifestTensors + 1; |
| // tensors array (len == numManifestTensors). |
| this->tensors.assign(ts.begin(), ts.end()); |
| // Arrays with len == numTensors. |
| this->valBuffer.assign(numTensors, nullptr); |
| this->lvls.resize(numTensors); |
| this->iters.resize(numTensors); |
| |
| // Reserve enough space for the loop stacks up front to avoid reallocation |
| // while loops are being pushed and popped. |
| this->loopStack.reserve(numLoops); |
| this->loopSeqStack.reserve(numLoops); |
| |
| // Index-reduction related fields. |
| this->dependentLvlMap.assign( |
| numTensors, std::vector<std::vector<std::pair<TensorLevel, unsigned>>>()); |
| this->sliceMeta.assign( |
| numTensors, std::vector<std::vector<std::pair<Value, unsigned>>>()); |
| this->levelReducedDep.assign(numTensors, std::vector<unsigned>()); |
| |
| // Initialize nested types of `TensorId`-indexed fields. |
| for (TensorId tid = 0; tid < numTensors; tid++) { |
| Level lvlRank; |
| if (tid == synTensorId) { |
| // The synthetic tensor is (conceptually) an all-dense tensor with rank |
| // equal to the total number of loops (each level can potentially be mapped |
| // to one of the loops being generated). |
| lvlRank = numLoops; |
| } else { |
| const Value t = tensors[tid]; |
| // Skip scalars and zero-ranked tensors. |
| if (isZeroRankedTensorOrScalar(t.getType())) |
| continue; |
| |
| auto rtp = getRankedTensorType(t); |
| const SparseTensorType stt(rtp); |
| lvlRank = stt.getLvlRank(); |
| } |
| |
| lvls[tid].resize(lvlRank); |
| iters[tid].resize(lvlRank); |
| loopHighs.assign(numLoops, nullptr); |
| |
| // Slice-driven loops related initialization. |
| levelReducedDep[tid].assign(lvlRank, 0); |
| dependentLvlMap[tid].assign( |
| lvlRank, std::vector<std::pair<TensorLevel, unsigned>>()); |
| sliceMeta[tid].assign(lvlRank, std::vector<std::pair<Value, unsigned>>()); |
| if (dimGetter && !isSynTensor(tid)) { |
| for (Level l = 0; l < lvlRank; l++) { |
| std::vector<std::pair<LoopId, unsigned>> deps = dimGetter(tid, l); |
| // Sort the dependences by loop order. |
| llvm::sort(deps, llvm::less_first()); |
| |
| dependentLvlMap[tid][l] = std::move(deps); |
| unsigned depends = dependentLvlMap[tid][l].size(); |
| if (depends == 0) |
| continue; |
| sliceMeta[tid][l].reserve(depends); |
| } |
| } |
| } |
| } |
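| |
| // Illustrative example of the resulting layout: with two manifest tensors |
| // and three loops, tensor ids 0 and 1 denote the manifest tensors, while |
| // id 2 is the synthetic tensor, each of whose three dense levels can be |
| // bound to one of the three loops. |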
| |
| std::unique_ptr<SparseIterator> |
| LoopEmitter::makeLevelIterator(OpBuilder &builder, Location loc, TensorId t, |
| Level l) { |
| Value tensor = tensors[t]; |
| auto stt = getSparseTensorType(tensor); |
| auto it = makeSimpleIterator(*lvls[t][l], emitStrategy); |
| |
| Value folded = tryFoldTensors(tensor); |
| if (folded != tensor) { |
| auto padOp = tensor.getDefiningOp<tensor::PadOp>(); |
| assert(padOp); |
| if (padOp.getPaddedDims().test(l)) { |
| Value low = unFoldOpIntResult(builder, loc, padOp.getMixedLowPad()[l]); |
| Value high = unFoldOpIntResult(builder, loc, padOp.getMixedHighPad()[l]); |
| auto padIt = makePaddedIterator(std::move(it), low, high, emitStrategy); |
| return padIt; |
| } |
| } |
| |
| if (stt.hasEncoding() && stt.getEncoding().isSlice()) { |
| Value offset = genSliceOffset(builder, loc, tensor, l); |
| Value stride = genSliceStride(builder, loc, tensor, l); |
| auto slicedIt = makeSlicedLevelIterator( |
| std::move(it), offset, stride, lvls[t][l]->getSize(), emitStrategy); |
| return slicedIt; |
| } |
| |
| return it; |
| } |
| |
| void LoopEmitter::initializeLoopEmit( |
| OpBuilder &builder, Location loc, LoopEmitter::OutputUpdater updater, |
| LoopEmitter::SynTensorBoundSetter synSetter) { |
| // For the synthetic tensor, set the high bound of each level (i.e., each |
| // loop) by calling the callback. |
| if (synSetter) { |
| TensorId synId = getSynTensorId(); |
| for (unsigned i = 0, e = loopHighs.size(); i < e; i++) { |
| Value sz = loopHighs[i] = synSetter(builder, loc, i); |
| auto [stl, it] = makeSynLevelAndIterator(sz, synId, i, emitStrategy); |
| lvls[synId][i] = std::move(stl); |
| iters[synId][i].emplace_back(std::move(it)); |
| } |
| } |
| |
| // For every manifest tensor: |
| // * get the values buffer. |
| // * For every level: |
| // * get the positions and coordinates buffers |
| // * get/compute the level-size, which is also used as the upper-bound |
| // on positions. |
| for (TensorId t = 0, numTensors = getNumManifestTensors(); t < numTensors; |
| t++) { |
| // TODO: this should be done through a folding pass after switching to |
| // `sparse_tensor.iterate`-based sparsification. |
| const Value tensor = tryFoldTensors(tensors[t]); |
| const auto rtp = dyn_cast<RankedTensorType>(tensor.getType()); |
| if (!rtp) |
| // Skip only scalars; zero-ranked tensors still need to be bufferized and |
| // (probably) filled with zeros by users. |
| continue; |
| // FIXME: the definition of `lvlRank` looks more like a dim-rank; |
| // but the variable is used as a level everywhere below, which |
| // suggests there may be some dim/lvl confusion going on here. |
| auto stt = getSparseTensorType(tensor); |
| const Level lvlRank = stt.getLvlRank(); |
| const auto shape = rtp.getShape(); |
| |
| // Scan all levels of the current tensor. |
| for (Level l = 0; l < lvlRank; l++) { |
| // Find the upper bound of the current level. |
| lvls[t][l] = makeSparseTensorLevel(builder, loc, tensor, t, l); |
| if (!dependentLvlMap[t][l].empty()) |
| continue; |
| |
| auto it = makeLevelIterator(builder, loc, t, l); |
| iters[t][l].emplace_back(std::move(it)); |
| } |
| |
| // Perform the required bufferization. Dense inputs materialize |
| // from the input tensors. Sparse inputs use sparse primitives to obtain the |
| // values. |
| // Extra output initialization is delegated to clients via the `updater` |
| // callback. |
| bool isOutput = isOutputTensor(t); |
| Type elementType = stt.getElementType(); |
| if (!stt.hasEncoding()) { |
| // Non-annotated dense tensors. |
| BaseMemRefType denseTp = MemRefType::get(shape, elementType); |
| |
| // TODO: if we unconditionally use a fully dynamic layout here, it breaks |
| // some vectorization passes, which require a static stride of 1. |
| // Is it possible to call the vectorization pass after bufferization? |
| if (llvm::isa_and_nonnull<tensor::ExtractSliceOp>(tensor.getDefiningOp())) |
| denseTp = bufferization::getMemRefTypeWithFullyDynamicLayout(rtp); |
| |
| Value denseVal = |
| builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor); |
| // Dense outputs need special handling. |
| if (isOutput && updater) |
| denseVal = updater(builder, loc, denseVal, tensor); |
| |
| valBuffer[t] = denseVal; |
| } else { |
| // Annotated sparse tensors. |
| // We also need the value buffer for all-dense annotated "sparse" |
| // tensors. |
| valBuffer[t] = builder.create<ToValuesOp>(loc, tensor); |
| } |
| // NOTE: we could also prepare level 0 here in advance; this would hoist |
| // some loop preparation out of tensor iteration, but would also |
| // (undesirably) hoist the code outside of if-conditions. |
| } |
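| // At this point (illustrative summary of the loop above), valBuffer[t] |
| // holds either a bufferization.to_memref of a dense input/output or the |
| // sparse_tensor.values memref of an annotated sparse tensor. |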
| // TODO: avoid treating subsection iterator as a special case. |
| initSubSectIterator(builder, loc); |
| } |
| |
| void LoopEmitter::initSubSectIterator(OpBuilder &builder, Location loc) { |
| Value c0 = C_IDX(0); |
| for (TensorId t = 0, e = tensors.size(); t < e; t++) { |
| auto rtp = dyn_cast<RankedTensorType>(tensors[t].getType()); |
| if (!rtp) |
| continue; |
| |
| Level lvlRank = SparseTensorType(rtp).getLvlRank(); |
| |
| // Compute the dependency reduction order. |
| auto remDepStack = dependentLvlMap; |
| std::vector<std::tuple<LoopId, TensorId, Level>> depRedOrder; |
| for (Level lvl = 0; lvl < lvlRank; lvl++) { |
| // Reverse queue into a stack. |
| std::reverse(remDepStack[t][lvl].begin(), remDepStack[t][lvl].end()); |
| for (auto [loop, coeff] : dependentLvlMap[t][lvl]) |
| depRedOrder.emplace_back(std::make_tuple(loop, t, lvl)); |
| } |
| |
| if (depRedOrder.empty()) |
| continue; |
| |
| std::sort(depRedOrder.begin(), depRedOrder.end(), |
| [](auto &l, auto &r) { return std::get<0>(l) < std::get<0>(r); }); |
| |
| SmallVector<SparseIterator *> lastIter(tensors.size(), nullptr); |
| for (auto [loop, t, lvl] : depRedOrder) { |
| std::pair<LoopId, unsigned> curDep = remDepStack[t][lvl].back(); |
| assert(curDep.first == loop); |
| remDepStack[t][lvl].pop_back(); |
| |
| auto lvlIt = makeLevelIterator(builder, loc, t, lvl); |
| const SparseIterator *parent = lastIter[t]; |
| if (!parent && lvl > 0) { |
| if (dependentLvlMap[t][lvl - 1].empty()) { |
| parent = iters[t][lvl - 1].back().get(); |
| } |
| } |
| |
| std::unique_ptr<SparseIterator> it; |
| if (!remDepStack[t][lvl].empty()) { |
| // Compute the subsection size. |
| Value size = c0; |
| for (auto [loop, stride] : remDepStack[t][lvl]) { |
| Value idxMax = SUBI(loopHighs[loop], C_IDX(1)); |
| size = ADDI(size, ADDI(MULI(idxMax, C_IDX(stride)), C_IDX(1))); |
| } |
| it = makeNonEmptySubSectIterator(builder, loc, parent, loopHighs[loop], |
| std::move(lvlIt), size, curDep.second, |
| emitStrategy); |
| } else { |
| const SparseIterator &subSectIter = *iters[t][lvl].back(); |
| it = makeTraverseSubSectIterator(builder, loc, subSectIter, *parent, |
| std::move(lvlIt), loopHighs[loop], |
| curDep.second, emitStrategy); |
| } |
| lastIter[t] = it.get(); |
| iters[t][lvl].emplace_back(std::move(it)); |
| } |
| } |
| } |
| |
| void LoopEmitter::categorizeIterators( |
| ArrayRef<TensorLevel> tidLvls, SmallVectorImpl<SparseIterator *> &raIters, |
| SmallVectorImpl<SparseIterator *> &spIters) { |
| // Categorizes the tensor levels used to generate the loop: random-accessible |
| // (dense) iterators can be located directly, while the remaining sparse |
| // iterators drive the loop conditions. |
| for (auto [t, l] : unpackTensorLevelRange(tidLvls)) { |
| SparseIterator *it = &getCurIterator(t, l); |
| if (it->randomAccessible()) |
| raIters.push_back(it); |
| else |
| spIters.push_back(it); |
| } |
| |
| std::stable_sort(spIters.begin(), spIters.end(), [](auto lhs, auto rhs) { |
| // AffineUnRed > Affine > Slice > Trivial |
| return static_cast<uint8_t>(lhs->kind) > static_cast<uint8_t>(rhs->kind); |
| }); |
| } |
| |
| void LoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc, |
| ArrayRef<TensorLevel> tidLvls) { |
| // TODO: sort |
| assert(loopSeqStack.size() == loopStack.size()); |
| // Prepares for all the tensors used in the current loop sequence. |
| |
| for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) { |
| levelReducedDep[tid][lvl]++; |
| prepareLoopOverTensorAtLvl(builder, loc, tid, lvl); |
| } |
| |
| // Universal Index starts from 0. |
| loopSeqStack.emplace_back(C_IDX(0), tidLvls.vec()); |
| } |
| |
| void LoopEmitter::exitCurrentLoopSeq(OpBuilder &builder, Location loc) { |
| assert(loopSeqStack.size() == loopStack.size() + 1); |
| |
| // Unwind the per-level dependence reduction performed when entering the |
| // current loop sequence. |
| for (auto [tid, lvl] : unpackTensorLevelRange(loopSeqStack.back().second)) |
| levelReducedDep[tid][lvl]--; |
| |
| loopSeqStack.pop_back(); |
| } |
| |
| Value LoopEmitter::genAffine(OpBuilder &builder, Location loc, AffineExpr a) { |
| switch (a.getKind()) { |
| case AffineExprKind::DimId: { |
| // FIXME: since the one callsite in Sparsification passes in a |
| // level-expression, the `getPosition` must in fact be a `Dimension`. |
| // However, elsewhere we have been led to expect that `loopIdToOrd` |
| // should be indexed by `LoopId`... |
| const auto loopId = cast<AffineDimExpr>(a).getPosition(); |
| return loopStack[loopId].iv; |
| } |
| case AffineExprKind::Add: { |
| auto binOp = cast<AffineBinaryOpExpr>(a); |
| return ADDI(genAffine(builder, loc, binOp.getLHS()), |
| genAffine(builder, loc, binOp.getRHS())); |
| } |
| case AffineExprKind::Mul: { |
| auto binOp = cast<AffineBinaryOpExpr>(a); |
| return MULI(genAffine(builder, loc, binOp.getLHS()), |
| genAffine(builder, loc, binOp.getRHS())); |
| } |
| case AffineExprKind::Constant: { |
| int64_t c = cast<AffineConstantExpr>(a).getValue(); |
| return C_IDX(c); |
| } |
| default: |
| llvm_unreachable("unexpected affine subscript"); |
| } |
| } |
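| |
| // As an illustration, an affine expression such as (d0 * 2 + d1) is emitted |
| // as (SSA names are illustrative): |
| //   %0 = arith.muli %iv_d0, %c2 : index |
| //   %1 = arith.addi %0, %iv_d1 : index |
| // where %iv_d0 and %iv_d1 are the induction variables of the corresponding |
| // loops on the loop stack. |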
| |
| std::pair<Operation *, Value> LoopEmitter::emitForLoopOverTensorAtLvl( |
| OpBuilder &builder, Location loc, SparseIterator &iter, |
| MutableArrayRef<Value> reduc, bool isParallel) { |
| |
| // TODO: support dynamic slices. |
| // Uses the first dimension here to build the loop bound (which is also the |
| // biggest range). |
| |
| Value step = C_IDX(1); |
| auto [lo, hi] = iter.genForCond(builder, loc); |
| Operation *loop = nullptr; |
| Value iv; |
| if (isParallel) { |
| scf::ParallelOp parOp = |
| builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc); |
| builder.setInsertionPointToStart(parOp.getBody()); |
| assert(parOp.getNumReductions() == reduc.size()); |
| iv = parOp.getInductionVars()[0]; |
| |
| // In-place update on the reduction variable vector. |
| // Note that the init vals are not the actual reduction variables but are |
| // instead used as "special handles" to (temporarily) represent them. The |
| // expression built on the init vals will be moved into scf.reduce and |
| // replaced with the block arguments when exiting the loop (see exitForLoop). |
| // This is needed because we cannot build the actual reduction block and get |
| // the actual reduction variables before users fill in the parallel loop body. |
| for (int i = 0, e = reduc.size(); i < e; i++) |
| reduc[i] = parOp.getInitVals()[i]; |
| loop = parOp; |
| } else { |
| scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc); |
| builder.setInsertionPointToStart(forOp.getBody()); |
| iv = forOp.getInductionVar(); |
| |
| // In-place update on the reduction variable vector. |
| assert(forOp.getNumRegionIterArgs() == reduc.size()); |
| for (int i = 0, e = reduc.size(); i < e; i++) |
| reduc[i] = forOp.getRegionIterArg(i); |
| loop = forOp; |
| } |
| assert(loop && iv); |
| |
| Value crd = iv; |
| if (!iter.randomAccessible()) { |
| iter.linkNewScope(iv); |
| crd = iter.deref(builder, loc); |
| } else { |
| iter.locate(builder, loc, iv); |
| } |
| |
| return {loop, crd}; |
| } |
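| |
| // Sketch of the emitted structure (illustrative IR, not verbatim output): |
| // for a non-random-accessible (e.g. compressed) level we get |
| //   scf.for %pos = %lo to %hi step %c1 iter_args(...) { |
| //     %crd = <dereference the iterator at position %pos> |
| //     ... |
| //   } |
| // whereas for a random-accessible (dense) level the induction variable is |
| // itself the coordinate and the iterator is simply located at it. |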
| |
| std::pair<Operation *, Value> LoopEmitter::emitWhileLoopOverTensorsAtLvls( |
| OpBuilder &builder, Location loc, ArrayRef<SparseIterator *> spIters, |
| MutableArrayRef<Value> reduc, bool needsUniv) { |
| // NOTE: the slice driven tensor-related reduction variable must |
| // appear before normal tensors. |
| |
| // The set of induction variables for the while loop. |
| SmallVector<Value> ivs; |
| |
| // Construct the while-loop with a parameter for each coordinate. |
| for (SparseIterator *it : spIters) { |
| ValueRange itVals = it->getCursor(); |
| ivs.append(itVals.begin(), itVals.end()); |
| } |
| |
| // The position where the user-supplied reduction variables start. |
| ivs.append(reduc.begin(), reduc.end()); |
| // Update universal index. |
| if (needsUniv) |
| ivs.push_back(loopSeqStack.back().first); |
| |
| // Ensures all operands are valid. |
| assert(llvm::all_of(ivs, [](Value v) { return v != nullptr; })); |
| TypeRange types = ValueRange(ivs).getTypes(); |
| auto whileOp = builder.create<scf::WhileOp>(loc, types, ivs); |
| |
| SmallVector<Location> locs(types.size(), loc); |
| Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs); |
| Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs); |
| |
| // Generates loop conditions. |
| builder.setInsertionPointToStart(before); |
| ValueRange bArgs = before->getArguments(); |
| Value whileCond = nullptr; // bool values for loop condition. |
| |
| for (SparseIterator *it : spIters) { |
| auto [cond, remArgs] = it->genWhileCond(builder, loc, bArgs); |
| whileCond = !whileCond ? cond : ANDI(whileCond, cond); |
| bArgs = remArgs; |
| } |
| // The remaining block arguments are user-provided reduction values and an |
| // optional universal index. Make sure their sizes match. |
| assert(bArgs.size() == reduc.size() + (needsUniv ? 1 : 0)); |
| builder.create<scf::ConditionOp>(loc, whileCond, before->getArguments()); |
| |
| // Generates loop body. |
| builder.setInsertionPointToStart(after); |
| ValueRange aArgs = after->getArguments(); |
| // Since some LoopCondKind might need extra checks to filter out invalid |
| // iterations, we maintain another array to hold the iteration arguments to |
| // yield if the checks fail. |
| SmallVector<Value> nextArgs(aArgs.begin(), aArgs.end()); |
| |
| for (SparseIterator *it : spIters) { |
| aArgs = it->linkNewScope(aArgs); |
| // Dereference the iterator to cache the coordinate. |
| it->deref(builder, loc); |
| } |
| |
| // In-place update on reduction variable. |
| assert(aArgs.size() == reduc.size() + (needsUniv ? 1 : 0)); |
| for (unsigned i = 0, e = reduc.size(); i < e; i++) |
| reduc[i] = aArgs[i]; |
| |
| Value min; |
| // Finds the minimum coordinate |
| if (!needsUniv) { |
| for (SparseIterator *it : spIters) { |
| if (min) { |
| Value cmp = CMPI(ult, it->getCrd(), min); |
| min = SELECT(cmp, it->getCrd(), min); |
| } else { |
| min = it->getCrd(); |
| } |
| } |
| } else { |
| // Otherwise, the universal index is the minimum coordinate. |
| min = whileOp.getAfterArguments().back(); |
| } |
| |
| return {whileOp, min}; |
| } |
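| |
| // Schematically (illustrative only), co-iterating two sparse iterators with |
| // a universal index produces: |
| //   scf.while (<cursor0>, <cursor1>, <reduc...>, %univ) { |
| //     %cont = arith.andi <cursor0 in bounds>, <cursor1 in bounds> |
| //     scf.condition(%cont) ... |
| //   } do { |
| //     <dereference both iterators; the loop coordinate is %univ, or the |
| //      minimum of the two coordinates when no universal index is needed> |
| //   } |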
| |
| bool LoopEmitter::shouldIteratedByForLoop(ArrayRef<SparseIterator *> spIters) { |
| // If we need to co-iterate over multiple sparse tensor levels, we need a |
| // while loop. |
| if (spIters.size() > 1) |
| return false; |
| |
| if (spIters.size() == 1) |
| return spIters.front()->iteratableByFor(); |
| |
| return true; |
| } |
| |
| Operation *LoopEmitter::enterCoIterationOverTensorsAtLvls( |
| OpBuilder &builder, Location loc, ArrayRef<TensorLevel> tidLvls, |
| MutableArrayRef<Value> reduc, bool tryParallel, bool needsUniv) { |
| |
| // TODO: support multiple return on parallel for? |
| tryParallel = tryParallel && reduc.size() <= 1; |
| |
| SmallVector<SparseIterator *> raIters; |
| SmallVector<SparseIterator *> spIters; |
| categorizeIterators(tidLvls, raIters, spIters); |
| |
| // Only when there is at least one sparse condition do we really need the |
| // universal index. |
| // TODO: Maybe we should instead require the merger to pass in a valid value |
| // in the first place instead of adjusting it in the LoopEmitter? |
| needsUniv = !spIters.empty() && needsUniv; |
| // The TensorLevel used for loop conditions. |
| // If there is any sparse level, we need to use the sparse condition. |
| // If all levels are dense, we can pick an arbitrary one (a dense |
| // slice-driven loop can be generated using a simple ForOp as well). |
| Operation *l = nullptr; |
| Value iv = nullptr; |
| SmallVector<TensorLevel> tls; |
| |
| // Generates loops differently depending on whether we need a slice-driven |
| // loop or a simple level traversal loop. |
| if (shouldIteratedByForLoop(spIters) && !needsUniv) { |
| assert(spIters.size() <= 1); |
| SparseIterator &it = spIters.empty() ? *raIters.front() : *spIters.front(); |
| std::tie(l, iv) = |
| emitForLoopOverTensorAtLvl(builder, loc, it, reduc, tryParallel); |
| tls.push_back(makeTensorLevel(it.tid, it.lvl)); |
| } else { |
| for (auto *it : spIters) { |
| tls.push_back(makeTensorLevel(it->tid, it->lvl)); |
| } |
| |
| if (needsUniv) |
| for (auto *it : raIters) |
| tls.push_back(makeTensorLevel(it->tid, it->lvl)); |
| |
| std::tie(l, iv) = |
| emitWhileLoopOverTensorsAtLvls(builder, loc, spIters, reduc, needsUniv); |
| } |
| |
| // Enter dense tensor levels. |
| for (SparseIterator *it : raIters) |
| it->locate(builder, loc, iv); |
| |
| // NOTE: we can also prepare for the next dim here in advance. |
| // Pushes the loop into stack. |
| loopStack.emplace_back(tls, l, builder.getInsertionBlock(), iv, loopTag); |
| return l; |
| } |
| |
| void LoopEmitter::locateLvlAtAffineAddress(OpBuilder &builder, Location loc, |
| TensorLevel tidLvl, |
| AffineExpr lvlExpr) { |
| auto [tid, lvl] = unpackTensorLevel(tidLvl); |
| |
| const SparseIterator *parent = |
| lvl == 0 ? nullptr : iters[tid][lvl - 1].back().get(); |
| auto &it = getCurIterator(tid, lvl); |
| it.genInit(builder, loc, parent); |
| |
| assert(it.kind == IterKind::kTrivial && it.randomAccessible()); |
| Value lvlCrd = genAffine(builder, loc, lvlExpr); |
| it.locate(builder, loc, lvlCrd); |
| } |
| |
| void LoopEmitter::prepareLoopOverTensorAtLvl(OpBuilder &builder, Location loc, |
| TensorId tid, Level lvl) { |
| // If this is the first level, there is no parent iterator for the current |
| // iterator. |
| // If the current iterator is a subsection-based iterator, the parent |
| // iterator is remembered by the iterator itself. |
| bool hasParent = lvl == 0 || !dependentLvlMap[tid][lvl].empty(); |
| |
| const SparseIterator *parent = |
| hasParent ? nullptr : iters[tid][lvl - 1].back().get(); |
| auto &it = getCurIterator(tid, lvl); |
| it.genInit(builder, loc, parent); |
| |
| // Locate the random-accessible iterator at position 0. |
| if (it.randomAccessible()) |
| it.locate(builder, loc, C_IDX(0)); |
| } |
| |
| void LoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc, |
| MutableArrayRef<Value> reduc) { |
| const LoopInfo &loopInfo = loopStack.back(); |
| if (auto forOp = llvm::dyn_cast<scf::ForOp>(loopInfo.loop)) { |
| if (!reduc.empty()) { |
| assert(reduc.size() == forOp.getNumResults()); |
| rewriter.create<scf::YieldOp>(loc, reduc); |
| } |
| // Exit the loop. |
| rewriter.setInsertionPointAfter(forOp); |
| // In-place update reduction variables. |
| for (unsigned i = 0, e = forOp.getResults().size(); i < e; i++) |
| reduc[i] = forOp.getResult(i); |
| } else { |
| auto parOp = llvm::cast<scf::ParallelOp>(loopInfo.loop); |
| if (!reduc.empty()) { |
| assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1); |
| Operation *redExp = reduc.front().getDefiningOp(); |
| // The reduction expression should have no uses. |
| assert(redExp->getUses().empty()); |
| // This must be a binary operation. |
| // NOTE: It is the user's responsibility to ensure that the operation is |
| // commutative. |
| assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1); |
| |
| Value redVal = parOp.getInitVals().front(); |
| Value curVal; |
| if (redExp->getOperand(0) == redVal) |
| curVal = redExp->getOperand(1); |
| else if (redExp->getOperand(1) == redVal) |
| curVal = redExp->getOperand(0); |
| // One of the operands must be the init value (which is also the |
| // previous reduction value). |
| assert(curVal); |
| #ifndef NDEBUG |
| // The reduction expression should be the only user of the reduction val |
| // inside the parallel for. |
| unsigned numUsers = 0; |
| for (Operation *op : redVal.getUsers()) { |
| if (op->getParentOp() == parOp) |
| numUsers++; |
| } |
| assert(numUsers == 1); |
| #endif // NDEBUG |
| |
| rewriter.setInsertionPointAfter(redExp); |
| auto redOp = rewriter.create<scf::ReduceOp>(loc, curVal); |
| // Attach to the reduction op. |
| Block *redBlock = &redOp.getReductions().front().front(); |
| rewriter.setInsertionPointToEnd(redBlock); |
| Operation *newRed = rewriter.clone(*redExp); |
| // Replaces arguments of the reduction expression by using the block |
| // arguments from scf.reduce. |
| rewriter.modifyOpInPlace( |
| newRed, [&]() { newRed->setOperands(redBlock->getArguments()); }); |
| // Erases the out-dated reduction expression. |
| rewriter.eraseOp(redExp); |
| rewriter.setInsertionPointToEnd(redBlock); |
| rewriter.create<scf::ReduceReturnOp>(loc, newRed->getResult(0)); |
| } |
| rewriter.setInsertionPointAfter(parOp); |
| // In-place update reduction variables. |
| for (unsigned i = 0, e = parOp.getResults().size(); i < e; i++) |
| reduc[i] = parOp.getResult(i); |
| } |
| } |
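| |
| // Illustrative sketch of the rewrite above (types and names are made up): a |
| // reduction expression built on the "special handle" %init, say |
| //   %sum = arith.addf %init, %x : f64 |
| // is cloned into the reduction region as |
| //   scf.reduce(%x : f64) { |
| //   ^bb0(%lhs: f64, %rhs: f64): |
| //     %r = arith.addf %lhs, %rhs : f64 |
| //     scf.reduce.return %r : f64 |
| //   } |
| // and the original expression is erased. |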
| |
| void LoopEmitter::exitWhileLoop(OpBuilder &builder, Location loc, |
| MutableArrayRef<Value> reduc) { |
| const LoopInfo &loopInfo = loopStack.back(); |
| auto whileOp = llvm::cast<scf::WhileOp>(loopInfo.loop); |
| Value iv = loopInfo.iv; |
| Value one = C_IDX(1); |
| |
| // Finalize the induction. Note that the induction could be performed |
| // in the individual if-branches to avoid re-evaluating the conditions. |
| // However, that would result in a rather elaborate forest of yield |
| // instructions during code generation. Moreover, performing the induction |
| // after the if-statements more closely resembles code generated by TACO. |
| SmallVector<Value> operands; |
| ValueRange whileRes = whileOp.getResults(); |
| |
| for (auto [tid, lvl] : unpackTensorLevelRange(loopInfo.tidLvls)) { |
| SparseIterator &it = getCurIterator(tid, lvl); |
| if (!it.randomAccessible()) { |
| // Forward the sparse iterator. |
| Value cmp = CMPI(eq, it.getCrd(), iv); |
| it.forwardIf(builder, loc, cmp); |
| operands.append(it.getCursor().begin(), it.getCursor().end()); |
| // Following loops continue iteration from the break point of the |
| // current while loop. |
| whileRes = it.linkNewScope(whileRes); |
| } else { |
| // Make sure randomly accessible (dense) iterator is set to the right |
| // position according to the universal index. |
| Value uniIdx = whileOp.getResults().back(); |
| it.locate(builder, loc, uniIdx); |
| } |
| } |
| |
| // Reduction value from users. |
| for (auto &i : reduc) { |
| operands.push_back(i); |
| // Update user reduction variables. |
| i = whileRes.front(); |
| whileRes = whileRes.drop_front(); |
| } |
| |
| // An (optional) universal index. |
| if (operands.size() < whileOp.getNumResults()) { |
| assert(operands.size() + 1 == whileOp.getNumResults()); |
| // The last one is the universal index. |
| operands.push_back(ADDI(iv, one)); |
| // Update the loop starting point of the current loop sequence. |
| loopSeqStack.back().first = whileOp->getResults().back(); |
| } |
| |
| if (!operands.empty()) |
| YIELD(operands); |
| |
| builder.setInsertionPointAfter(whileOp); |
| } |
| |
| void LoopEmitter::exitCurrentLoop(RewriterBase &rewriter, Location loc, |
| MutableArrayRef<Value> reduc) { |
| // Clean up the values; this helps us discover potential bugs at an earlier |
| // stage (instead of silently using a wrong value). |
| const LoopInfo &loopInfo = loopStack.back(); |
| |
| // Sets the insertion point to the right position. |
| rewriter.setInsertionPointToEnd(loopInfo.userCodeBlock); |
| if (!loopInfo.userCodeBlock->empty() && |
| llvm::isa<scf::YieldOp>(&loopInfo.userCodeBlock->back())) { |
| // scf::While/For inserts an implicit yield op when there are no loop |
| // iter args. In this case, we need to insert the code before the yield. |
| assert(loopInfo.userCodeBlock->back().getNumResults() == 0); |
| rewriter.setInsertionPoint(&loopInfo.userCodeBlock->back()); |
| } |
| |
| if (llvm::isa<scf::WhileOp>(loopInfo.loop)) { |
| exitWhileLoop(rewriter, loc, reduc); |
| } else { |
| exitForLoop(rewriter, loc, reduc); |
| } |
| |
| assert(loopStack.size() == loopSeqStack.size()); |
| loopStack.pop_back(); |
| } |
| |
| #undef CMPI |
| #undef C_IDX |
| #undef YIELD |
| #undef ADDI |
| #undef ANDI |
| #undef SUBI |
| #undef MULI |
| #undef REMUI |
| #undef DIVUI |
| #undef SELECT |