| //===- LoopEmitter.h --------------------------------------------*- C++ -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_UTILS_LOOPEMITTER_H_ |
| #define MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_UTILS_LOOPEMITTER_H_ |
| |
| #include <vector> |
| |
| #include "SparseTensorIterator.h" |
| |
| #include "mlir/Dialect/SparseTensor/IR/Enums.h" |
| #include "mlir/Dialect/SparseTensor/IR/SparseTensor.h" |
| #include "mlir/Dialect/SparseTensor/Transforms/Passes.h" |
| #include "mlir/Dialect/SparseTensor/Utils/Merger.h" |
| #include "mlir/IR/PatternMatch.h" |
| |
| namespace mlir { |
| namespace sparse_tensor { |
| |
| // A compressed <tensor id, level> pair. |
| using TensorLevel = unsigned; |
| |
| // |
// The LoopEmitter class manages sparse tensors and helps generate the
// loop structure to (co)-iterate over sparse tensors.
| // |
| // An example usage: |
| // To generate the following loops over T1<?x?> and T2<?x?> |
| // |
//  for i : TENSOR_1_0 {
//    for j : TENSOR_2_0 {
//      for k : TENSOR_1_1 {}
//      for k : TENSOR_2_1 {}
//    }
//  }
| // |
| // One can use |
| // |
//   LoopEmitter loopEmitter({T1, T2});
//   loopEmitter.initializeLoopEmit();
//   loopEmitter.enterLoopOverTensorAtLvl(T1, 0);
//   loopEmitter.enterLoopOverTensorAtLvl(T2, 0);
//   loopEmitter.enterLoopOverTensorAtLvl(T1, 1);
//   loopEmitter.exitCurrentLoop(); // exit k
//   loopEmitter.enterLoopOverTensorAtLvl(T2, 1);
//   loopEmitter.exitCurrentLoop(); // exit k
//   loopEmitter.exitCurrentLoop(); // exit j
//   loopEmitter.exitCurrentLoop(); // exit i
| // |
| class LoopEmitter { |
| public: |
/// Optional callback function to set up dense output tensors when
/// initializing the loop emitter (e.g., to fill a dense output with zeros).
| using OutputUpdater = function_ref<Value(OpBuilder &builder, Location loc, |
| Value memref, Value tensor)>; |
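
// A minimal sketch of an updater that zero-fills a dense output buffer
// (`genZeroFill` is a hypothetical helper standing in for the actual fill
// logic, e.g., emitting a fill of the zero constant into the memref):
//
//   auto zeroFill = [](OpBuilder &builder, Location loc, Value memref,
//                      Value tensor) -> Value {
//     genZeroFill(builder, loc, memref); // hypothetical zero-fill helper
//     return memref;
//   };
//   loopEmitter.initializeLoopEmit(builder, loc, zeroFill);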
| |
| /// Optional callback function to set the bound for the synthetic tensor, |
| /// which essentially is the dense loop bound. |
| using SynTensorBoundSetter = |
| function_ref<Value(OpBuilder &builder, Location loc, Level lvl)>; |
| |
// Map from [tid, lvl] to a list of dependent [LoopId, coefficient] pairs
// for subscript expressions on sparse tensors.
| // |
| // E.g., for affine index (2 * d0 + d1), it depends on loop d0 and d1 (for |
| // affine expression reduction) and uses 2 and 1 for coefficients on d0, d1 |
| // respectively. If the list is empty, it means that there is no affine |
| // expression on the input [tid, lvl]. |
| // |
| // NOTE: LoopEmitter assumes that the loop id is consistent with the loop |
| // order, i.e., loop `d0` will be generated before loop `d1`. |
| using DependentLvlGetter = |
| function_ref<std::vector<std::pair<LoopId, unsigned>>(TensorId, Level)>; |
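
// For example (a sketch), for a level subscripted by the affine expression
// (2 * d0 + d1), a conforming getter would return:
//
//   getter(tid, lvl) == {{/*d0=*/0, 2}, {/*d1=*/1, 1}}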
| |
| LoopEmitter() = default; |
| |
| /// Takes an array of input tensors, which the generated loops will |
| /// iterate over. Each tensor is given a `TensorId` (numerically equal |
| /// to the position of that tensor `Value` in the array). Setting |
| /// `isSparseOut` indicates that the sparse output tensor is empty, |
| /// so the loop emitter will generate loops over it according to the |
| /// level-sizes. |
| void |
| initialize(ValueRange tensors, StringAttr loopTag = nullptr, |
| bool hasOutput = false, bool isSparseOut = false, |
| unsigned numLoops = 0, DependentLvlGetter getter = nullptr, |
| SparseEmitStrategy emitStrategy = SparseEmitStrategy::kFunctional); |
| |
| explicit LoopEmitter( |
| ValueRange tensors, StringAttr loopTag = nullptr, bool hasOutput = false, |
| bool isSparseOut = false, unsigned numLoops = 0, |
| DependentLvlGetter getter = nullptr, |
| SparseEmitStrategy emitStrategy = SparseEmitStrategy::kFunctional); |
| |
| /// Starts a loop emitting session by generating all the buffers needed |
| /// for iterating over the tensors. |
| void initializeLoopEmit(OpBuilder &builder, Location loc, |
| OutputUpdater updater = nullptr, |
| SynTensorBoundSetter synSetter = nullptr); |
| |
| /// Generates code to compute an affine expression whose variables are |
| /// `LoopId`s (i.e., `a.cast<AffineDimExpr>().getPosition()` is a valid |
| /// `LoopId`). |
| Value genAffine(OpBuilder &builder, Location loc, AffineExpr a); |
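
// E.g., for the affine expression (d0 + d1), genAffine emits roughly the
// following (a sketch; actual value names will differ):
//
//   %sum = arith.addi %iv_d0, %iv_d1 : index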
| |
/// Enters a new loop sequence; the loops within the same sequence start
/// from the break points of the previous loop instead of starting over
/// from 0.
| /// e.g., |
| /// { |
| /// // loop sequence start. |
| /// p0 = while(xxx) |
| /// ... |
| /// break p0 |
| /// |
| /// // Starts loop from p0 |
| /// for (i = p0; i < end; i++) |
| /// ... |
| /// // loop sequence end. |
| /// } |
| void enterNewLoopSeq(OpBuilder &builder, Location loc, |
| ArrayRef<TensorLevel> tidLvls); |
| |
/// Exits the current loop sequence; this will reset the universal index
/// to 0.
| void exitCurrentLoopSeq(OpBuilder &builder, Location loc); |
| |
| /// Emits the address for a dense level based on the value evaluated by the |
| /// provided affine expression. |
| void locateLvlAtAffineAddress(OpBuilder &builder, Location loc, |
| TensorLevel tidLvl, AffineExpr lvlExpr); |
| |
| // TODO: Get rid of `lvls` in the argument list? Track the level we |
| // are currently at internally. Then it would be enterNextLvlForTensor. |
| // Still need a way to specify the lvl for non-annotated tensors though, |
| // as those can be accessed out of order. |
| // |
/// Emits a co-iteration loop over a set of tensors.
/// Emits a loop over tensor_tid_lvl, assuming that the loops over
/// tensor_tid_[0, lvl - 1] have already been generated.
/// The function also performs an in-place update on the `reduc` vector to
/// return the reduction variable used inside the generated loop.
| Operation *enterCoIterationOverTensorsAtLvls( |
| OpBuilder &builder, Location loc, ArrayRef<TensorLevel> tidLvls, |
| MutableArrayRef<Value> reduc = {}, bool isParallel = false, |
| bool needsUniv = false); |
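
// Example usage (a sketch; `t0` and `t1` are assumed to be TensorIds of
// previously registered tensors, and `initVal` the initial reduction
// value):
//
//   SmallVector<Value> reduc = {initVal};
//   Operation *loop = loopEmitter.enterCoIterationOverTensorsAtLvls(
//       builder, loc,
//       {loopEmitter.makeTensorLevel(t0, 0),
//        loopEmitter.makeTensorLevel(t1, 0)},
//       reduc);
//   // `reduc[0]` now holds the reduction variable to use in the loop body.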
| |
/// Generates code to exit the current loop (e.g., generates yields,
/// forwards loop induction variables, etc.).
| void exitCurrentLoop(RewriterBase &rewriter, Location loc, |
| MutableArrayRef<Value> reduc = {}); |
| |
/// Gets the range of values for all loop induction variables.
| auto getLoopIVsRange() const { |
| return llvm::map_range(loopStack, [](const LoopInfo &li) { return li.iv; }); |
| } |
| |
/// Returns a vector containing the loop induction variables of all
/// loops in the current loop-stack.
| SmallVector<Value> getLoopIVs() const { |
| return llvm::to_vector(getLoopIVsRange()); |
| } |
| |
| /// Gets the current depth of the loop-stack. |
| LoopId getCurrentDepth() const { return llvm::range_size(getLoopIVsRange()); } |
| |
/// Gets the loop induction variable for the given loop.
| Value getLoopIV(LoopId n) const { |
| if (n >= getCurrentDepth()) |
| return Value(); |
| auto it = getLoopIVsRange().begin(); |
| std::advance(it, n); |
| return *it; |
| } |
| |
| /// Gets the total number of manifest tensors (excluding the synthetic |
| /// tensor). |
| unsigned getNumManifestTensors() const { return tensors.size(); } |
| |
/// Gets the total number of tensors that the loop emitter is operating on.
| unsigned getNumTensors() const { |
| // Manifest tensors with one synthetic tensor at the end. |
| return getNumManifestTensors() + 1; |
| } |
| |
/// Gets the TensorId for the synthetic tensor.
| TensorId getSynTensorId() const { return tensors.size(); } |
| |
/// Gets the TensorId for the output tensor.
| TensorId getOutTensorId() const { |
| assert(hasOutput); |
| return getNumManifestTensors() - 1; |
| } |
| |
| /// Compresses a TensorId and Level into a TensorLevel. |
| TensorLevel makeTensorLevel(TensorId t, Level l) const { |
| return l * getNumTensors() + t; |
| } |
| |
| /// De-compresses a TensorLevel back to a pair of TensorId and Level. |
| std::pair<TensorId, Level> unpackTensorLevel(TensorLevel tidLvl) const { |
| unsigned nt = getNumTensors(); |
| return std::make_pair(tidLvl % nt, tidLvl / nt); |
| } |
| |
/// Converts a range of TensorLevel to a range of std::pair<TensorId, Level>.
| template <class ContainerTy> |
| auto unpackTensorLevelRange(ContainerTy &&c) const { |
| using EltTy = decltype(*c.begin()); |
| static_assert(std::is_same_v<llvm::remove_cvref_t<EltTy>, TensorLevel>, |
| "Must be unpacking a TensorLevel range"); |
| return llvm::map_range(std::forward<ContainerTy>(c), [this](EltTy tl) { |
| return this->unpackTensorLevel(tl); |
| }); |
| } |
| |
| /// |
| /// Getters. |
| /// |
| SmallVector<Value> getValPosits(TensorId tid) const { |
| SmallVector<Value> batchCrds = iters[tid].back().back()->getBatchCrds(); |
| Value lastLvlPos = iters[tid].back().back()->getCurPosition().first; |
| batchCrds.push_back(lastLvlPos); |
| return batchCrds; |
| }; |
| Value getCoord(TensorId tid, Level lvl) const { |
| return getCurIterator(tid, lvl).getCrd(); |
| }; |
| const std::vector<Value> &getValBuffer() const { return valBuffer; }; |
| |
| constexpr static llvm::StringLiteral getLoopEmitterLoopAttrName() { |
| return llvm::StringLiteral("Emitted from"); |
| } |
| |
| private: |
| /// |
| /// Structure definitions that hold different kinds of loops information. |
| /// |
| |
// LoopInfo stores the information of a loop generated by LoopEmitter,
// e.g., the set of tensor levels that the loop iterates over.
| struct LoopInfo final { |
| LoopInfo(ArrayRef<TensorLevel> tidLvls, Operation *loop, Block *userBlock, |
| Value iv, StringAttr loopTag) |
| : tidLvls(tidLvls), loop(loop), userCodeBlock(userBlock), iv(iv) { |
// Attach a special tag to the loop-emitter-generated loop.
| if (loopTag) |
| loop->setAttr(LoopEmitter::getLoopEmitterLoopAttrName(), loopTag); |
| } |
// The set of <tensor, lvl> pairs, with *only* trivial index expressions,
// that are used as the condition for the generated loop. Extra information
// is required for levels with non-trivial index expressions, which is
// maintained by the slice-related fields (e.g., `sliceMeta`) below.
| const llvm::SmallVector<TensorLevel> tidLvls; |
| const Operation *loop; // the loop operation |
| Block *const userCodeBlock; // the block holding users' generated code. |
| const Value iv; // the induction variable for the loop |
| }; |
| |
| void categorizeIterators(ArrayRef<TensorLevel> tidLvls, |
| SmallVectorImpl<SparseIterator *> &raIters, |
| SmallVectorImpl<SparseIterator *> &spIters); |
| /// |
| /// LoopEmitter internal helper functions. |
| /// |
| |
| using LoopBodyBuilder = llvm::function_ref<void(OpBuilder &, Location, Value, |
| MutableArrayRef<Value>)>; |
| |
/// Whether the list of sparse conditions should be iterated by a for loop.
| bool shouldIteratedByForLoop(ArrayRef<SparseIterator *> spIters); |
| |
| /// Generates instructions to compute the coordinate of tensors[tid][lvl] |
| /// under the current loop context. The final argument is the |
| /// collapsed-output level, whereas this function handles converting |
/// that to the uncollapsed-input level.
| Value genSparseCrd(OpBuilder &builder, Location loc, TensorId tid, |
| Level dstLvl); |
| |
| bool isSynTensor(TensorId tid) const { return tid == getSynTensorId(); } |
| |
| bool isOutputTensor(TensorId tid) const { |
| return hasOutput && tid == getOutTensorId(); |
| } |
| |
| bool isSparseOutput(TensorId tid) const { |
| return isOutputTensor(tid) && isSparseOut; |
| } |
| |
| bool isValidLevel(TensorId tid, Level lvl) const { |
| return tid < lvls.size() && lvl < lvls[tid].size(); |
| } |
| |
| /// Prepares loop for iterating over `tensor[lvl]`, under the assumption |
| /// that `tensor[0...lvl-1]` loops have already been set up. |
| void prepareLoopOverTensorAtLvl(OpBuilder &builder, Location loc, |
| TensorId tid, Level lvl); |
| |
/// Emits a for loop to iterate over a tensor level with the provided
/// lower bound `lo` and upper bound `hi`. Apart from iterating over just
/// a single tensor level, for loops can also be used for slice-driven
/// loops on dense levels.
| /// Returns a pair: the loop generated and the value for the induction |
| /// variable. |
| std::pair<Operation *, Value> |
| emitForLoopOverTensorAtLvl(OpBuilder &builder, Location loc, |
| SparseIterator &iter, MutableArrayRef<Value> reduc, |
| bool isParallel); |
| |
/// Emits a while loop to co-iterate over a list of sparse conditions, or
/// a (complex) single sparse condition that cannot be handled by a for
/// loop (e.g., an index reduction loop).
/// Returns a pair: the loop generated and the value for the induction
/// variable (which is the minimum coordinate of all the tensors being
/// iterated over).
| std::pair<Operation *, Value> |
| emitWhileLoopOverTensorsAtLvls(OpBuilder &builder, Location loc, |
| ArrayRef<SparseIterator *> iters, |
| MutableArrayRef<Value> reduc, bool needsUniv); |
| |
/// Exits a for loop and returns the reduction results, e.g.,
/// for sequential for loops:
| /// %ret = for () { |
| /// ... |
| /// %val = addi %args, %c |
| /// yield %val |
| /// } |
/// For parallel loops, the following code generated by the user:
| /// %ret = parallel () init(%args) { |
| /// ... |
| /// %val = op %args, %c |
| /// } |
| /// will be transformed into |
| /// %ret = parallel () init(%args) { |
| /// ... |
| /// scf.reduce(%c) bb0(%0, %1){ |
| /// %val = op %0, %1 |
| /// scf.reduce.return %val |
| /// } |
| /// } |
/// NOTE: only one instruction will be moved into the reduce block;
/// the transformation will fail if multiple instructions are used to
/// compute the reduction value. %ret is returned to the user, while
/// %val is provided by the user (`reduc`).
| void exitForLoop(RewriterBase &rewriter, Location loc, |
| MutableArrayRef<Value> reduc); |
| |
/// Exits a while loop and returns the reduction results.
| void exitWhileLoop(OpBuilder &builder, Location loc, |
| MutableArrayRef<Value> reduc); |
| |
| // |
| // Slice-driven loop related methods. |
| // |
| |
| void initSubSectIterator(OpBuilder &builder, Location loc); |
| |
/// Gets the reduced number of constraints on tensor[tid][lvl].
| unsigned redDepOnLevel(TensorId tid, Level lvl) const { |
| return levelReducedDep[tid][lvl]; |
| }; |
| |
| SparseIterator &getCurIterator(TensorId tid, Level lvl) const { |
| if (dependentLvlMap[tid][lvl].empty()) |
| return *iters[tid][lvl].back(); |
| |
| assert(redDepOnLevel(tid, lvl) >= 1); |
| return *iters[tid][lvl][redDepOnLevel(tid, lvl) - 1]; |
| } |
| |
| std::unique_ptr<SparseIterator> |
| makeLevelIterator(OpBuilder &builder, Location loc, TensorId tid, Level l); |
| |
/// An optional string attribute that should be attached to the loops
/// generated by the loop emitter; it may help subsequent passes to
/// identify loops that operate on sparse tensors more easily.
| StringAttr loopTag; |
| /// Whether the loop emitter needs to treat the last tensor as the output |
| /// tensor. |
| bool hasOutput; |
| bool isSparseOut; |
| SparseEmitStrategy emitStrategy; |
| |
| // |
| // Fields which have `numTensor` many entries. |
| // |
| |
| /// Input and (optional) output tensors. |
| std::vector<Value> tensors; |
| std::vector<Value> loopHighs; |
| std::vector<std::vector<std::unique_ptr<SparseTensorLevel>>> lvls; |
| std::vector<std::vector<std::vector<std::unique_ptr<SparseIterator>>>> iters; |
| std::vector<Value> valBuffer; // to_value |
| |
// Map from [tid, lvl] to a list of dependent [LoopId, coefficient] pairs.
| // See comments for `DependentLvlGetter`. |
| std::vector<std::vector<std::vector<std::pair<LoopId, unsigned>>>> |
| dependentLvlMap; |
| |
| // The (size, stride) for each conceptual slice used for index reduction |
| // loops. |
| std::vector<std::vector<std::vector<std::pair<Value, unsigned>>>> sliceMeta; |
| |
| // The number of reduced dependencies on a tensor level so far. |
| std::vector<std::vector<unsigned>> levelReducedDep; |
| |
| // |
| // Fields which have at most `numLoops` many entries. |
| // |
| |
| /// Loop Stack, stores the information of all the nested loops that are |
| /// alive. |
| std::vector<LoopInfo> loopStack; |
| |
// Loop Sequence Stack, stores the universal index for the current loop
// sequence, and a list of the tensor levels that the loop sequence
// traverses.
| std::vector<std::pair<Value, std::vector<TensorLevel>>> loopSeqStack; |
| }; |
| |
| } // namespace sparse_tensor |
| } // namespace mlir |
| |
| #endif // MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_UTILS_LOOPEMITTER_H_ |