| //===- TilingInterfaceImpl.cpp - Implementation of TilingInterface -------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "mlir/Dialect/Linalg/Transforms/TilingInterfaceImpl.h" |
| |
| #include "mlir/Analysis/SliceAnalysis.h" |
| #include "mlir/Dialect/Affine/IR/AffineOps.h" |
| #include "mlir/Dialect/Affine/Utils.h" |
| #include "mlir/Dialect/Arith/IR/Arith.h" |
| #include "mlir/Dialect/Arith/Utils/Utils.h" |
| #include "mlir/Dialect/Linalg/IR/Linalg.h" |
| #include "mlir/Dialect/Linalg/Utils/Utils.h" |
| #include "mlir/Dialect/MemRef/IR/MemRef.h" |
| #include "mlir/Dialect/Tensor/IR/Tensor.h" |
| #include "mlir/Dialect/Utils/IndexingUtils.h" |
| #include "mlir/Dialect/Utils/StaticValueUtils.h" |
| #include "mlir/Dialect/Utils/StructuredOpsUtils.h" |
| #include "mlir/IR/BuiltinTypeInterfaces.h" |
| #include "mlir/Interfaces/TilingInterface.h" |
| #include "mlir/Interfaces/ValueBoundsOpInterface.h" |
| #include "llvm/Support/Debug.h" |
| #include <optional> |
| |
| #define DEBUG_TYPE "linalg-tiling-interface-impl" |
| |
| using namespace mlir; |
| using namespace mlir::linalg; |
| |
| //===----------------------------------------------------------------------===// |
| // Utility methods for implementation of Tiling Interface for Linalg ops |
| //===----------------------------------------------------------------------===// |
| |
| /// Return the SSA values that represent the data point accessed using a given |
| /// `indexingMap` for a given point in the iteration space represented by `ivs`. |
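/// For example (an illustrative sketch): with `indexingMap = (d0, d1) ->
/// (d1, d0 + d1)` and `ivs = [%i, %j]`, this creates one single-result
/// `affine.apply` per map result and returns roughly:
///   %idx0 = affine.apply affine_map<(d0, d1) -> (d1)>(%i, %j)
///   %idx1 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%i, %j)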
| static SmallVector<Value> getIndicesForAccess(OpBuilder &b, Location loc, |
| AffineMap indexingMap, |
| ValueRange ivs) { |
| SmallVector<Value> indices; |
| indices.reserve(indexingMap.getNumResults()); |
| for (auto result : indexingMap.getResults()) { |
| AffineMap m = AffineMap::get(indexingMap.getNumDims(), |
| indexingMap.getNumSymbols(), result); |
| Value v = affine::AffineApplyOp::create(b, loc, m, ivs); |
| indices.push_back(v); |
| } |
| return indices; |
| } |
| |
| /// Method to inline the payload of a `linalgOp` given the iteration space |
| /// point and values for the arguments of the payload. |
| static LogicalResult inlinePayload(OpBuilder &b, LinalgOp linalgOp, |
| ValueRange ivs, ValueRange argValues) { |
| Block *body = linalgOp.getBlock(); |
| IRMapping map; |
| map.map(body->getArguments(), argValues); |
| for (auto &op : body->without_terminator()) { |
| if (auto indexOp = dyn_cast<IndexOp>(&op)) { |
| map.map(indexOp.getResult(), ivs[indexOp.getDim()]); |
| continue; |
| } |
| b.clone(op, map); |
| } |
| |
| Operation *terminator = body->getTerminator(); |
| Location loc = terminator->getLoc(); |
| for (const auto &operand : llvm::enumerate(terminator->getOperands())) { |
| Value toStore = map.lookupOrDefault(operand.value()); |
| OpOperand *storeInto = linalgOp.getDpsInitOperand(operand.index()); |
| auto indices = getIndicesForAccess( |
| b, loc, linalgOp.getMatchingIndexingMap(storeInto), ivs); |
| memref::StoreOp::create(b, loc, toStore, |
| linalgOp.getDpsInitOperand(operand.index())->get(), |
| indices); |
| } |
| return success(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // External Model for implementing `TilingInterface` for `LinalgOp`s. |
| //===----------------------------------------------------------------------===// |
| |
| namespace { |
/// External model implementation of TilingInterface for LinalgOps. An external
/// model implementation is used for now until the use of `TilingInterface` is
/// on par with the current Linalg tiling + fusion patterns. Once it is, it may
/// be possible to move this into the op definition (though there are
/// advantages to leaving it as an external model).
| template <typename LinalgOpTy> |
| struct LinalgOpTilingInterface |
| : public TilingInterface::ExternalModel<LinalgOpTilingInterface<LinalgOpTy>, |
| LinalgOpTy> { |
| /// Return the loop iterator type. |
| SmallVector<utils::IteratorType> getLoopIteratorTypes(Operation *op) const { |
| LinalgOpTy concreteOp = cast<LinalgOpTy>(op); |
| return concreteOp.getIteratorTypesArray(); |
| } |
| |
| /// Return the iteration domain range. |
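  /// For example (an illustrative sketch): for a `linalg.matmul` on
  /// `tensor<?x?xf32>` operands with sizes (M, K), (K, N), and (M, N), the
  /// flat list of operand dims is [M, K, K, N, M, N], and applying the
  /// shapes-to-loops map yields the ranges [0, M, 1], [0, N, 1], [0, K, 1].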
| SmallVector<Range> getIterationDomain(Operation *op, OpBuilder &b) const { |
| OpBuilder::InsertionGuard g(b); |
| b.setInsertionPoint(op); |
| Location loc = op->getLoc(); |
| LinalgOp linalgOp = cast<LinalgOp>(op); |
| SmallVector<OpFoldResult> allShapesSizes = |
| linalgOp.createFlatListOfOperandDims(b, loc); |
| AffineMap map = linalgOp.getShapesToLoopsMap(); |
| |
| return llvm::to_vector( |
| llvm::map_range(map.getResults(), [&](AffineExpr loopExpr) { |
| OpFoldResult ofr = affine::makeComposedFoldedAffineApply( |
| b, loc, loopExpr, allShapesSizes); |
| return Range{b.getIndexAttr(0), ofr, b.getIndexAttr(1)}; |
| })); |
| } |
| |
| /// Instantiate the tiled implementation of the operation. |
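  /// For example (a sketch, with hypothetical SSA names): tiling a
  /// `linalg.matmul` with offsets (%iv0, %iv1, %c0) and sizes (%ts0, %ts1, %K)
  /// extracts a slice per operand and clones the op on the slices:
  ///   %lhs = tensor.extract_slice %A[%iv0, 0] [%ts0, %K] [1, 1]
  ///   %rhs = tensor.extract_slice %B[0, %iv1] [%K, %ts1] [1, 1]
  ///   %out = tensor.extract_slice %C[%iv0, %iv1] [%ts0, %ts1] [1, 1]
  ///   %tiled = linalg.matmul ins(%lhs, %rhs : ...) outs(%out : ...)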
| FailureOr<TilingResult> |
| getTiledImplementation(Operation *op, OpBuilder &b, |
| ArrayRef<OpFoldResult> offsets, |
| ArrayRef<OpFoldResult> sizes) const { |
    // Leave the `sizeBounds` value empty. That is only needed when the
    // specified `sizes` could lead to out-of-bounds accesses.
| Location loc = op->getLoc(); |
| LinalgOp linalgOp = cast<LinalgOp>(op); |
| SmallVector<Value> valuesToTile = linalgOp->getOperands(); |
| SmallVector<Value> tiledOperands = makeTiledShapes( |
| b, loc, linalgOp, valuesToTile, offsets, sizes, {}, true); |
| SmallVector<Operation *> generatedSlices = llvm::map_to_vector( |
| llvm::make_filter_range( |
| tiledOperands, |
| [](Value v) -> bool { |
| return isa_and_nonnull<tensor::ExtractSliceOp, memref::SubViewOp>( |
| v.getDefiningOp()); |
| }), |
| [](Value v) -> Operation * { return v.getDefiningOp(); }); |
| |
| SmallVector<Type> resultTensorTypes = |
| getTensorOutputTypes(linalgOp, tiledOperands); |
| |
| Operation *tiledOp = clone(b, linalgOp, resultTensorTypes, tiledOperands); |
| offsetIndices(b, cast<LinalgOp>(tiledOp), offsets); |
| |
| return TilingResult{ |
| {tiledOp}, SmallVector<Value>(tiledOp->getResults()), generatedSlices}; |
| } |
| |
  /// Utility to compute the offsets and sizes of the iteration space tile
  /// implied by the given operand offsets and sizes, as per the indexing maps
  /// of the linalg op. This helps in fusing the linalg op as a consumer of a
  /// given slice op.
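  /// For example (a sketch): for an operand indexed by `(d0, d1) -> (d1, d0)`,
  /// an operand tile with offsets [%o0, %o1] and sizes [%s0, %s1] maps to
  /// iteration-space offsets {d1: %o0, d0: %o1} and sizes {d1: %s0, d0: %s1};
  /// dimensions not covered by any operand tile default to the full iteration
  /// domain.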
| static LogicalResult |
| getMappedOffsetAndSize(LinalgOp linalgOp, OpBuilder &b, |
| ArrayRef<AffineMap> indexingMaps, |
| ArrayRef<SmallVector<OpFoldResult>> allOffsets, |
| ArrayRef<SmallVector<OpFoldResult>> allSizes, |
| SmallVectorImpl<OpFoldResult> &mappedOffsetsVec, |
| SmallVectorImpl<OpFoldResult> &mappedSizesVec) { |
| DenseMap<unsigned, OpFoldResult> mappedOffsets, mappedSizes; |
| |
| for (auto [indexingMap, offsets, sizes] : |
| llvm::zip_equal(indexingMaps, allOffsets, allSizes)) { |
| for (auto [resultExpr, offset, size] : |
| llvm::zip_equal(indexingMap.getResults(), offsets, sizes)) { |
| auto dimExpr = dyn_cast<AffineDimExpr>(resultExpr); |
| if (!dimExpr) |
| continue; |
| unsigned position = dimExpr.getPosition(); |
| auto it = mappedOffsets.find(position); |
| if (it != mappedOffsets.end()) { |
| OpFoldResult seenOffset = it->second; |
| OpFoldResult seenSize = mappedSizes.lookup(position); |
| if (seenOffset != offset || seenSize != size) { |
| LLVM_DEBUG({ |
| llvm::dbgs() << "inconsistent iteration space mapping from " |
| "offsets/sizes of operands/results"; |
| }); |
| return failure(); |
| } |
| } else { |
| mappedOffsets[position] = offset; |
| mappedSizes[position] = size; |
| } |
| } |
| } |
| |
| // Aggregate from the given operand offsets and sizes, or default to |
| // iteration space values. |
| SmallVector<Range> iterationDomain = |
| cast<TilingInterface>(linalgOp.getOperation()).getIterationDomain(b); |
| mappedOffsetsVec.resize(iterationDomain.size()); |
| mappedSizesVec.resize(iterationDomain.size()); |
| for (auto [index, domain] : llvm::enumerate(iterationDomain)) { |
| auto it = mappedOffsets.find(index); |
| if (it != mappedOffsets.end()) { |
| mappedOffsetsVec[index] = it->second; |
| mappedSizesVec[index] = mappedSizes.lookup(index); |
| continue; |
| } |
| mappedOffsetsVec[index] = domain.offset; |
| mappedSizesVec[index] = domain.size; |
| } |
| return success(); |
| } |
| |
  /// Method to return the tile of the iteration domain that computes the
  /// given tiles of the operands.
| LogicalResult getIterationDomainTileFromOperandTiles( |
| Operation *op, OpBuilder &b, ArrayRef<unsigned> operandNumbers, |
| ArrayRef<SmallVector<OpFoldResult>> allOffsets, |
| ArrayRef<SmallVector<OpFoldResult>> allSizes, |
| SmallVectorImpl<OpFoldResult> &iterDomainOffsets, |
| SmallVectorImpl<OpFoldResult> &iterDomainSizes) const { |
    auto linalgOp = cast<LinalgOp>(op);

    SmallVector<AffineMap> indexingMaps =
        llvm::map_to_vector(operandNumbers, [&](unsigned operandNumber) {
          OpOperand &opOperand = linalgOp->getOpOperand(operandNumber);
          return linalgOp.getMatchingIndexingMap(&opOperand);
        });
    return getMappedOffsetAndSize(linalgOp, b, indexingMaps, allOffsets,
                                  allSizes, iterDomainOffsets,
                                  iterDomainSizes);
| } |
| |
| /// Return the details of the output tile generated by the tiled |
| /// implementation. |
| LogicalResult |
| getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber, |
| ArrayRef<OpFoldResult> offsets, |
| ArrayRef<OpFoldResult> sizes, |
| SmallVector<OpFoldResult> &resultOffsets, |
| SmallVector<OpFoldResult> &resultSizes) const { |
| Location loc = op->getLoc(); |
| LinalgOp linalgOp = cast<LinalgOp>(op); |
| |
| AffineExpr d0; |
| bindDims(b.getContext(), d0); |
| SmallVector<OpFoldResult> subShapeSizes = |
| llvm::to_vector(llvm::map_range(sizes, [&](OpFoldResult ofr) { |
| return affine::makeComposedFoldedAffineApply(b, loc, d0 - 1, ofr); |
| })); |
| |
| OpOperand *outOperand = linalgOp.getDpsInitOperand(resultNumber); |
| SliceParameters sliceParams = computeSliceParameters( |
| b, loc, outOperand->get(), sizes, |
| linalgOp.getMatchingIndexingMap(outOperand), offsets, |
        /*ubs=*/{}, subShapeSizes, true);
| resultOffsets = sliceParams.offsets; |
| resultSizes = sliceParams.sizes; |
| return success(); |
| } |
| |
| LogicalResult getIterationDomainTileFromResultTile( |
| Operation *op, OpBuilder &b, unsigned resultNumber, |
| ArrayRef<OpFoldResult> offsets, ArrayRef<OpFoldResult> sizes, |
| SmallVectorImpl<OpFoldResult> &iterDomainOffsets, |
| SmallVectorImpl<OpFoldResult> &iterDomainSizes) const { |
| auto linalgOp = cast<LinalgOp>(op); |
| |
| // Check that the indexing map used for the output is a projected |
| // permutation. This could be relaxed with a more general approach that can |
| // map the offsets and sizes from the result to iteration space tiles |
| // (filling in full extent for dimensions not used to access the result). |
| AffineMap indexingMap = |
| linalgOp.getIndexingMapMatchingResult(op->getResult(resultNumber)); |
| if (!indexingMap.isProjectedPermutation()) { |
| return op->emitOpError( |
| "unhandled tiled implementation generation when result is not " |
| "accessed using a permuted projection"); |
| } |
| |
| SmallVector<OpFoldResult> allOffsets = llvm::to_vector(offsets); |
| SmallVector<OpFoldResult> allSizes = llvm::to_vector(sizes); |
| auto status = |
| getMappedOffsetAndSize(linalgOp, b, indexingMap, {allOffsets}, |
| {allSizes}, iterDomainOffsets, iterDomainSizes); |
| (void)status; |
| assert(succeeded(status) && "unexpected error in offset calculation"); |
| return success(); |
| } |
| |
| FailureOr<TilingResult> |
| generateResultTileValue(Operation *op, OpBuilder &b, unsigned resultNumber, |
| ArrayRef<OpFoldResult> offsets, |
| ArrayRef<OpFoldResult> sizes) const { |
| SmallVector<OpFoldResult> mappedOffsets, mappedSizes; |
| if (failed(getIterationDomainTileFromResultTile( |
| op, b, resultNumber, offsets, sizes, mappedOffsets, mappedSizes))) { |
| return failure(); |
| } |
| auto tilingInterfaceOp = cast<TilingInterface>(op); |
| FailureOr<TilingResult> tilingResult = |
| tilingInterfaceOp.getTiledImplementation(b, mappedOffsets, mappedSizes); |
| |
| if (failed(tilingResult)) |
| return failure(); |
| |
| if (tilingResult->tiledOps.size() != 1) |
| return op->emitOpError("failed to generate tiled implementation"); |
| |
| return TilingResult{ |
| tilingResult->tiledOps, |
| SmallVector<Value>{tilingResult->tiledValues[resultNumber]}, |
| tilingResult->generatedSlices}; |
| } |
| |
| /// Method to generate the tiled implementation of an operation from the tile |
| /// of the operand. |
| FailureOr<TilingResult> getTiledImplementationFromOperandTiles( |
| Operation *op, OpBuilder &b, ArrayRef<unsigned> operandNumbers, |
| ArrayRef<SmallVector<OpFoldResult>> allOffsets, |
| ArrayRef<SmallVector<OpFoldResult>> allSizes) const { |
| SmallVector<OpFoldResult> mappedOffsets, mappedSizes; |
| if (failed(getIterationDomainTileFromOperandTiles( |
| op, b, operandNumbers, allOffsets, allSizes, mappedOffsets, |
| mappedSizes))) { |
| return failure(); |
| } |
| return getTiledImplementation(op, b, mappedOffsets, mappedSizes); |
| } |
| |
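  /// Method to lower the operation to scalar loads/stores at the given
  /// iteration space point `ivs`. For example (an illustrative sketch), an
  /// elementwise add with buffer semantics at `ivs = [%i, %j]` lowers to
  /// roughly:
  ///   %a = memref.load %A[%i, %j]
  ///   %b = memref.load %B[%i, %j]
  ///   %c = arith.addf %a, %b
  ///   memref.store %c, %C[%i, %j]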
| LogicalResult generateScalarImplementation(Operation *op, OpBuilder &builder, |
| Location loc, |
| ValueRange ivs) const { |
| auto linalgOp = cast<LinalgOp>(op); |
| if (!linalgOp.hasPureBufferSemantics()) |
| return op->emitOpError("expected operation to have buffer semantics"); |
| |
| SmallVector<Value> indexedValues; |
| indexedValues.reserve(linalgOp->getNumOperands()); |
| Location linalgOpLoc = op->getLoc(); |
| /// Load the data corresponding to the block arguments that |
| /// represent input operands. |
| for (OpOperand &operand : linalgOp->getOpOperands()) { |
| if (!linalgOp.payloadUsesValueFromOperand(&operand)) { |
| indexedValues.push_back(nullptr); |
| continue; |
| } |
| if (linalgOp.isScalar(&operand)) { |
| indexedValues.push_back(operand.get()); |
| continue; |
| } |
| SmallVector<Value> indices = getIndicesForAccess( |
| builder, linalgOpLoc, linalgOp.getMatchingIndexingMap(&operand), ivs); |
| Value load = |
| memref::LoadOp::create(builder, linalgOpLoc, operand.get(), indices); |
| indexedValues.push_back(load); |
| } |
| |
| /// Inline the op payload and store the result. |
| return inlinePayload(builder, linalgOp, ivs, indexedValues); |
| } |
| }; |
| |
| //===----------------------------------------------------------------------===// |
| // External Model for implementing `PartialReductionInterface` for `LinalgOp`s. |
| //===----------------------------------------------------------------------===// |
| |
| /// In a given set vector, get the position of a particular element. |
| std::optional<int> getPositionIn(const llvm::SetVector<unsigned> &reductionDims, |
| unsigned value) { |
| for (auto [index, reductionDim] : llvm::enumerate(reductionDims)) { |
| if (reductionDim == value) { |
| return index; |
| } |
| } |
| return std::nullopt; |
| } |
| |
/// Return the AffineMaps to use for the `outs` operands of the linalg op
/// generated for partial results. The new AffineMap is the AffineMap of the
/// untiled op with the reduction dimensions appended at the end, in the order
/// in which they were specified during tiling.
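/// For example (illustrative): for a reduction with iterator types
/// [parallel, reduction], an init map `(d0, d1) -> (d0)`, and
/// `reductionDims = {1}`, the partial-result map becomes
/// `(d0, d1) -> (d0, d1)`, materializing the tiled reduction dimension as an
/// extra result dimension.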
| static SmallVector<AffineMap> |
| getPartialResultAffineMaps(LinalgOp linalgOp, |
| const SetVector<unsigned> &reductionDims) { |
| auto partialReductionMaps = llvm::map_to_vector( |
| linalgOp.getDpsInitsMutable(), [&](OpOperand &opOperand) { |
| AffineMap map = linalgOp.getMatchingIndexingMap(&opOperand); |
| for (auto redPos : reductionDims) { |
| map = |
| map.insertResult(getAffineDimExpr(redPos, linalgOp.getContext()), |
| map.getNumResults()); |
| } |
| return map; |
| }); |
| return partialReductionMaps; |
| } |
| |
| struct InitSliceInfo { |
| SmallVector<int64_t> resultShape; |
| SmallVector<OpFoldResult> offsets; |
| SmallVector<OpFoldResult> sizes; |
| SmallVector<OpFoldResult> strides; |
| }; |
| |
| /// Return the result shape, offsets, sizes and strides of the slice of the |
| /// `initValue` to use as the destination of the partial reduction op generated |
| /// with outer reduction strategy. |
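/// For example (an illustrative sketch): with `partialReductionMap =
/// (d0, d1) -> (d0, d1)` and `reductionDims = {1}`, the slice along d1 uses
/// offset 0 and the tiled size, while d0 reuses the corresponding entries of
/// `offsets` and `sizes`.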
| static InitSliceInfo getInitSliceInfoForOuterReduction( |
| MLIRContext *context, ArrayRef<OpFoldResult> offsets, |
| ArrayRef<OpFoldResult> sizes, const SetVector<unsigned> &reductionDims, |
| ArrayRef<OpFoldResult> splitReductionIvs, AffineMap partialReductionMap) { |
| int64_t initRank = partialReductionMap.getNumResults(); |
| SmallVector<OpFoldResult> initOffsets, initSizes; |
| Attribute zero = IntegerAttr::get(IndexType::get(context), 0); |
| Attribute one = IntegerAttr::get(IndexType::get(context), 1); |
| SmallVector<OpFoldResult> initStrides(initRank, one); |
| for (AffineExpr dimExpr : partialReductionMap.getResults()) { |
| unsigned dim = cast<AffineDimExpr>(dimExpr).getPosition(); |
| if (reductionDims.contains(dim)) { |
| initOffsets.push_back(zero); |
| } else { |
| initOffsets.push_back(offsets[dim]); |
| } |
| initSizes.push_back(sizes[dim]); |
| } |
| SmallVector<int64_t> resultShape; |
| std::tie(resultShape, std::ignore) = decomposeMixedValues(initSizes); |
| return {resultShape, initOffsets, initSizes, initStrides}; |
| } |
| |
| /// Return the result shape, offsets, sizes and strides of the slice of the |
| /// `initValue` to use as destination of the partial reduction op generated with |
| /// outer parallel strategy. |
| static InitSliceInfo getInitSliceInfoForOuterParallel( |
| MLIRContext *context, ArrayRef<OpFoldResult> offsets, |
| ArrayRef<OpFoldResult> sizes, const SetVector<unsigned> &reductionDims, |
| ArrayRef<OpFoldResult> splitReductionIvs, AffineMap partialReductionMap) { |
| int64_t initRank = partialReductionMap.getNumResults(); |
| SmallVector<OpFoldResult> initOffsets, initSizes; |
| Attribute one = IntegerAttr::get(IndexType::get(context), 1); |
| SmallVector<OpFoldResult> initStrides(initRank, one); |
| SmallVector<OpFoldResult> resultShape; |
| for (AffineExpr dimExpr : partialReductionMap.getResults()) { |
| unsigned dim = cast<AffineDimExpr>(dimExpr).getPosition(); |
| if (std::optional<unsigned> dimPos = getPositionIn(reductionDims, dim)) { |
| initOffsets.push_back(splitReductionIvs[dimPos.value()]); |
| initSizes.push_back(one); |
| } else { |
| initOffsets.push_back(offsets[dim]); |
| initSizes.push_back(sizes[dim]); |
| resultShape.push_back(sizes[dim]); |
| } |
| } |
| SmallVector<int64_t> staticShapes; |
| std::tie(staticShapes, std::ignore) = decomposeMixedValues(resultShape); |
| return {staticShapes, initOffsets, initSizes, initStrides}; |
| } |
| |
| /// Return the result shape, offsets, sizes and strides of the slice of the |
| /// `initValue` to use as destination of the partial reduction op. |
| static InitSliceInfo getInitSliceInfo(MLIRContext *context, |
| ReductionTilingStrategy strategy, |
| ArrayRef<OpFoldResult> offsets, |
| ArrayRef<OpFoldResult> sizes, |
| const SetVector<unsigned> &reductionDims, |
| ArrayRef<OpFoldResult> splitReductionIvs, |
| AffineMap partialReductionMap) { |
| if (strategy == ReductionTilingStrategy::PartialReductionOuterReduction) { |
| return getInitSliceInfoForOuterReduction(context, offsets, sizes, |
| reductionDims, splitReductionIvs, |
| partialReductionMap); |
| } |
| assert(strategy == ReductionTilingStrategy::PartialReductionOuterParallel && |
| "unexpected ReductionTilingStrategy"); |
| return getInitSliceInfoForOuterParallel(context, offsets, sizes, |
| reductionDims, splitReductionIvs, |
| partialReductionMap); |
| } |
| |
| /// External model implementation of PartialReductionInterface for |
| /// LinalgOps. |
| template <typename LinalgOpTy> |
| struct LinalgOpPartialReductionInterface |
| : public PartialReductionOpInterface::ExternalModel< |
| LinalgOpPartialReductionInterface<LinalgOpTy>, LinalgOpTy> { |
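  /// Create the initial tensors to hold the partial results, filled with the
  /// neutral element of the combiner op. For example (a sketch), for an
  /// `arith.addf` reduction over f32 this generates roughly:
  ///   %empty = tensor.empty(...) : tensor<?x?xf32>
  ///   %init = linalg.fill ins(%cst_zero : f32) outs(%empty : tensor<?x?xf32>)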
| FailureOr<SmallVector<Value>> generateInitialTensorForPartialReduction( |
| Operation *op, OpBuilder &b, Location loc, ArrayRef<OpFoldResult> sizes, |
| const SetVector<unsigned> &reductionDims) const { |
| auto linalgOp = cast<LinalgOp>(op); |
| |
| OpBuilder::InsertionGuard guard(b); |
| if (linalgOp.hasPureBufferSemantics()) |
| return op->emitOpError("expected operation to have tensor semantics"); |
| |
| SmallVector<AffineMap> partialResultMaps = |
| getPartialResultAffineMaps(linalgOp, reductionDims); |
| |
| SmallVector<Value> inits; |
| for (auto [initIdx, result, partialMap] : |
| llvm::enumerate(linalgOp->getResults(), partialResultMaps)) { |
| SmallVector<Operation *, 4> combinerOps; |
| if (!matchReduction(linalgOp.getRegionOutputArgs(), initIdx, |
| combinerOps) || |
| combinerOps.size() != 1) |
| return op->emitOpError("Failed to anaysis the reduction operation."); |
| |
| Operation *reductionOp = combinerOps[0]; |
| std::optional<TypedAttr> identity = arith::getNeutralElement(reductionOp); |
      if (!identity.has_value())
        return op->emitOpError(
            "failed to get an identity value for the reduction operation");
| |
| // Append the new partial result dimensions. |
| SmallVector<OpFoldResult> partialResultShape; |
| for (AffineExpr dimExpr : partialMap.getResults()) { |
| auto dim = cast<AffineDimExpr>(dimExpr); |
| partialResultShape.push_back(sizes[dim.getPosition()]); |
| } |
| |
| Type elType = getElementTypeOrSelf(result.getType()); |
| Value emptyTensor = |
| tensor::EmptyOp::create(b, loc, partialResultShape, elType); |
| Value constantOp = arith::ConstantOp::create(b, loc, *identity); |
| auto identityTensor = |
| linalg::FillOp::create(b, loc, constantOp, emptyTensor); |
| inits.push_back(identityTensor.getResult(0)); |
| } |
| |
| return inits; |
| } |
| |
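  /// Generate the operation that computes the partial result for one tile.
  /// For example (a sketch): with the outer-reduction strategy, a reduction
  /// over d1 is rewritten as a generic with d1 marked parallel whose init has
  /// the tiled reduction dimension appended, writing partial sums into, e.g.,
  /// a `tensor<?x?xf32>` instead of a `tensor<?xf32>`.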
| FailureOr<TilingResult> |
| tileToPartialReduction(Operation *op, OpBuilder &b, Location loc, |
| ReductionTilingStrategy tilingStrategy, |
| ValueRange init, ArrayRef<OpFoldResult> offsets, |
| ArrayRef<OpFoldResult> sizes, |
| const SetVector<unsigned> &reductionDims, |
| ArrayRef<OpFoldResult> splitReductionIvs) const { |
| OpBuilder::InsertionGuard guard(b); |
| auto linalgOp = cast<LinalgOp>(op); |
| |
| SmallVector<AffineMap> partialReductionMaps = |
| getPartialResultAffineMaps(linalgOp, reductionDims); |
| |
    // Step 1. Extend the init maps to include the reduction dimensions, since
    // we are converting them to parallel dimensions.
| SmallVector<AffineMap> newInitMaps; |
| if (tilingStrategy == |
| ReductionTilingStrategy::PartialReductionOuterReduction) { |
| newInitMaps = llvm::to_vector(partialReductionMaps); |
| } else { |
| newInitMaps = llvm::map_to_vector( |
| linalgOp.getDpsInitsMutable(), [&](OpOperand &opOperand) { |
| return linalgOp.getMatchingIndexingMap(&opOperand); |
| }); |
| } |
| |
| // Step 2a: Extract a slice of the input operands. |
| SmallVector<Value> tiledInputs = makeTiledShapes( |
| b, loc, linalgOp, linalgOp.getDpsInputs(), offsets, sizes, {}, true); |
| SmallVector<Operation *> generatedSlices = llvm::map_to_vector( |
| llvm::make_filter_range( |
| tiledInputs, [](Value v) -> bool { return v.getDefiningOp(); }), |
| [](Value v) -> Operation * { return v.getDefiningOp(); }); |
| |
| // Step 2b: Extract a slice of the init operands. |
| SmallVector<Value, 1> tiledInits; |
| for (auto [partialReductionMap, valueToTile] : |
| llvm::zip_equal(partialReductionMaps, init)) { |
| InitSliceInfo sliceInfo = getInitSliceInfo( |
| b.getContext(), tilingStrategy, offsets, sizes, reductionDims, |
| splitReductionIvs, partialReductionMap); |
| auto valueToTileType = cast<RankedTensorType>(valueToTile.getType()); |
| RankedTensorType sliceResultType = RankedTensorType::get( |
| sliceInfo.resultShape, valueToTileType.getElementType(), |
| valueToTileType.getEncoding()); |
| auto sliceOp = tensor::ExtractSliceOp::create( |
| b, loc, sliceResultType, valueToTile, sliceInfo.offsets, |
| sliceInfo.sizes, sliceInfo.strides); |
| tiledInits.push_back(sliceOp.getResult()); |
| generatedSlices.push_back(sliceOp); |
| } |
| |
| // Update the indexing maps. |
| SmallVector<AffineMap> newMaps = linalgOp.getIndexingMapsArray(); |
| for (auto [initOperand, newInitMap] : |
| llvm::zip_equal(linalgOp.getDpsInitsMutable(), newInitMaps)) { |
| int mapIdx = linalgOp.getIndexingMapIndex(&initOperand); |
| newMaps[mapIdx] = newInitMap; |
| } |
| |
| // Step 3. Change the reduction dim iterator types. |
| SmallVector<utils::IteratorType> newIteratorTypes = |
| linalgOp.getIteratorTypesArray(); |
| if (tilingStrategy == |
| ReductionTilingStrategy::PartialReductionOuterReduction) { |
| for (int dim : reductionDims) |
| newIteratorTypes[dim] = utils::IteratorType::parallel; |
| } |
| |
| // Step 4. Create the new generic op. |
| Operation *partialReductionOp; |
| auto resultTypes = ValueRange(tiledInits).getTypes(); |
| if (tilingStrategy == |
| ReductionTilingStrategy::PartialReductionOuterReduction) { |
| auto genericOp = GenericOp::create(b, loc, resultTypes, tiledInputs, |
| tiledInits, newMaps, newIteratorTypes); |
| IRMapping mapping; |
| op->getRegion(0).cloneInto(&genericOp.getRegion(), |
| genericOp.getRegion().begin(), mapping); |
| partialReductionOp = genericOp.getOperation(); |
| } else { |
| SmallVector<Value> operands = std::move(tiledInputs); |
| llvm::append_range(operands, tiledInits); |
| partialReductionOp = mlir::clone(b, op, resultTypes, operands); |
| } |
| return TilingResult{ |
| {partialReductionOp}, |
| llvm::map_to_vector(partialReductionOp->getResults(), |
| [](OpResult r) -> Value { return r; }), |
| generatedSlices}; |
| } |
| |
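  /// Merge the partial results back into the original inits. For example (an
  /// illustrative sketch), an `arith.addf` reduction is merged along the
  /// appended partial dimension as roughly:
  ///   %merged = linalg.reduce { arith.addf }
  ///       ins(%partial : tensor<?x?xf32>)
  ///       outs(%init : tensor<?xf32>) dimensions = [1]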
| FailureOr<MergeResult> |
| mergeReductions(Operation *op, OpBuilder &b, Location loc, |
| ValueRange partialReduce, |
| const SetVector<unsigned> &reductionDims) const { |
| auto linalgOp = cast<LinalgOp>(op); |
| SmallVector<AffineMap> partialReductionMaps = |
| getPartialResultAffineMaps(linalgOp, reductionDims); |
| |
| // Permute the reduction dims as permuted by the partial result map. |
| SmallVector<Operation *> mergeOperations; |
| SmallVector<Value> replacements; |
| for (auto [idx, init, partialResult, partialMap] : llvm::enumerate( |
| linalgOp.getDpsInits(), partialReduce, partialReductionMaps)) { |
| unsigned initIdx = idx; |
| // linalg.reduce's iteration space is the tiled result's iteration space |
| // (and not the tiled operation's iteration space). To account for this, |
| // permute the reduction dimensions based on the partial result map of the |
| // tiled result. |
| SmallVector<int64_t> partialReductionDims; |
| for (auto [resultNum, dimExpr] : |
| llvm::enumerate(partialMap.getResults())) { |
| unsigned dim = cast<AffineDimExpr>(dimExpr).getPosition(); |
| if (llvm::is_contained(reductionDims, dim)) { |
| partialReductionDims.push_back(resultNum); |
| } |
| } |
| |
| auto reduction = linalg::ReduceOp::create( |
| b, loc, partialResult, init, partialReductionDims, |
| [&linalgOp, &initIdx](OpBuilder &b, Location loc, ValueRange inputs) { |
| // Get the combiner op. |
| SmallVector<Operation *, 4> combinerOps; |
| matchReduction(linalgOp.getRegionOutputArgs(), initIdx, |
| combinerOps); |
| Operation *clonedReductionOp = b.clone(*combinerOps[0]); |
            // Combine the partial result (inputs[0]) with the accumulated
            // value (inputs[1]).
| clonedReductionOp->setOperand(0, inputs[0]); |
| clonedReductionOp->setOperand(1, inputs[1]); |
| linalg::YieldOp::create(b, loc, clonedReductionOp->getResult(0)); |
| }); |
| |
| mergeOperations.push_back(reduction); |
| replacements.push_back(reduction->getResult(0)); |
| } |
| |
| return MergeResult{mergeOperations, replacements}; |
| } |
| |
| LogicalResult getPartialResultTilePosition( |
| Operation *op, OpBuilder &b, unsigned resultNumber, |
| ReductionTilingStrategy tilingStrategy, ArrayRef<OpFoldResult> offsets, |
| ArrayRef<OpFoldResult> sizes, const SetVector<unsigned> &reductionDims, |
| ArrayRef<OpFoldResult> splitReductionIvs, |
| SmallVector<OpFoldResult> &resultOffsets, |
| SmallVector<OpFoldResult> &resultSizes) const { |
| auto linalgOp = cast<LinalgOp>(op); |
| SmallVector<AffineMap> partialReductionMaps = |
| getPartialResultAffineMaps(linalgOp, reductionDims); |
| InitSliceInfo sliceInfo = getInitSliceInfo( |
| b.getContext(), tilingStrategy, offsets, sizes, reductionDims, |
| splitReductionIvs, partialReductionMaps[resultNumber]); |
| std::swap(resultOffsets, sliceInfo.offsets); |
| std::swap(resultSizes, sliceInfo.sizes); |
| |
| return success(); |
| } |
| }; |
| |
| template <typename OpTy> |
| static SmallVector<Range> getPackUnPackIterationDomain(OpTy op, |
| OpBuilder &builder) { |
| static_assert(llvm::is_one_of<OpTy, PackOp, UnPackOp>::value, |
| "applies to only pack or unpack operations"); |
| OpBuilder::InsertionGuard g(builder); |
| int64_t rank = (std::is_same<OpTy, PackOp>::value) ? op.getSourceRank() |
| : op.getDestRank(); |
| OpFoldResult zero = builder.getIndexAttr(0); |
| OpFoldResult one = builder.getIndexAttr(1); |
| ReifiedRankedShapedTypeDims resultShape; |
| (void)reifyResultShapes(builder, op, resultShape); |
| SmallVector<Range> loopBounds(rank); |
| for (auto dim : llvm::seq<int64_t>(0, rank)) { |
| loopBounds[dim].offset = zero; |
| loopBounds[dim].stride = one; |
| loopBounds[dim].size = resultShape[0][dim]; |
| } |
| return loopBounds; |
| } |
| |
| static void applyPermToRange(SmallVector<OpFoldResult> &offsets, |
| SmallVector<OpFoldResult> &sizes, |
| ArrayRef<int64_t> permutation) { |
| if (permutation.empty()) |
| return; |
| applyPermutationToVector<OpFoldResult>(offsets, permutation); |
| applyPermutationToVector<OpFoldResult>(sizes, permutation); |
| } |
| |
| struct PackOpTiling |
| : public TilingInterface::ExternalModel<PackOpTiling, linalg::PackOp> { |
| |
| SmallVector<utils::IteratorType> getLoopIteratorTypes(Operation *op) const { |
| // Note that here we only consider untiled dimensions and outer tiled data |
| // dimensions, the inner tiled data dimensions are materialized when |
| // building the body of the operation. |
| auto packOp = cast<PackOp>(op); |
| SmallVector<utils::IteratorType> iteratorTypes( |
| packOp.getSourceRank(), utils::IteratorType::parallel); |
| return iteratorTypes; |
| } |
| |
| SmallVector<Range> getIterationDomain(Operation *op, OpBuilder &b) const { |
| return getPackUnPackIterationDomain<PackOp>(cast<PackOp>(op), b); |
| } |
| |
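  /// For example (an illustrative sketch): tiling a
  /// `linalg.pack ... inner_tiles = [32]` from `tensor<128xf32>` into
  /// `tensor<4x32xf32>` with offset %o and size %s on the outer dimension
  /// extracts a source slice at [%o * 32] of size [%s * 32], a dest slice at
  /// [%o, 0] of sizes [%s, 32], and clones the pack op onto the slices.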
| FailureOr<TilingResult> |
| getTiledImplementation(Operation *op, OpBuilder &b, |
| ArrayRef<OpFoldResult> offsets, |
| ArrayRef<OpFoldResult> sizes) const { |
| auto packOp = cast<PackOp>(op); |
| Location loc = packOp.getLoc(); |
| |
| // The tiling is applied on interchanged dimensions. We have to undo the |
| // interchange to map sizes and offsets to the original input. |
| int64_t inputRank = packOp.getSourceRank(); |
| SmallVector<OpFoldResult> origOffsets(offsets); |
| SmallVector<OpFoldResult> origSizes(sizes); |
| applyPermToRange(origOffsets, origSizes, |
| invertPermutationVector(packOp.getOuterDimsPerm())); |
| |
| DenseMap<int64_t, OpFoldResult> dimAndTileMapping = |
| packOp.getDimAndTileMapping(); |
| SmallVector<OpFoldResult> srcDimValues = |
| tensor::getMixedSizes(b, loc, packOp.getSource()); |
| SmallVector<OpFoldResult> inputIndices, inputSizes; |
| for (auto dim : llvm::seq<int64_t>(0, inputRank)) { |
| using AV = affine::AffineValueExpr; |
| affine::AffineBuilder ab(b, loc); |
| AffineExpr dim0, dim1, sym; |
| bindDims(b.getContext(), dim0, dim1); |
| bindSymbols(b.getContext(), sym); |
| if (dimAndTileMapping.count(dim)) { |
| // If the data dimension is tiled, the i-th index is the product of |
| // offset_i and tile_i, and the i-th size is the product of sizes_i and |
| // tile_i. |
| auto avOffset = AV(dim0).bind(origOffsets[dim]); |
| auto avSize = AV(dim0).bind(origSizes[dim]); |
| auto avTileSize = AV(sym).bind(dimAndTileMapping[dim]); |
| inputIndices.push_back(ab.mul(avOffset, avTileSize)); |
| inputSizes.push_back(ab.mul(avSize, avTileSize)); |
| } else { |
| inputIndices.push_back(origOffsets[dim]); |
| inputSizes.push_back(origSizes[dim]); |
| } |
| |
| // Limit the size of the input operand for incomplete tiles. |
| if (packOp.getPaddingValue()) { |
| OpFoldResult dimSize = srcDimValues[dim]; |
| auto avDimSize = AV(dim0).bind(dimSize); |
| auto avInputIdx = AV(dim1).bind(inputIndices.back()); |
| inputSizes.back() = |
| ab.min({inputSizes.back(), ab.sub(avDimSize, avInputIdx)}); |
| } |
| } |
| |
| auto oneAttr = b.getI64IntegerAttr(1); |
| SmallVector<OpFoldResult> strides(inputRank, oneAttr); |
| |
| SmallVector<Value> tiledOperands; |
| auto sourceSlice = tensor::ExtractSliceOp::create( |
| b, loc, packOp.getSource(), inputIndices, inputSizes, strides); |
| tiledOperands.push_back(sourceSlice); |
| |
| SmallVector<OpFoldResult> outputOffsets, outputSizes; |
| if (failed(getResultTilePosition(op, b, 0, offsets, sizes, outputOffsets, |
| outputSizes))) |
| return {}; |
| |
| strides.append(packOp.getDestRank() - inputRank, oneAttr); |
| auto outSlice = tensor::ExtractSliceOp::create( |
| b, loc, packOp.getDest(), outputOffsets, outputSizes, strides); |
| tiledOperands.push_back(outSlice); |
| |
| if (auto val = packOp.getPaddingValue()) |
| tiledOperands.push_back(val); |
| for (auto tile : packOp.getInnerTiles()) |
| tiledOperands.push_back(tile); |
| |
| Operation *tiledPackOp = PackOp::create( |
| b, loc, TypeRange{outSlice.getType()}, tiledOperands, op->getAttrs()); |
| |
| return TilingResult{ |
| {tiledPackOp}, |
| SmallVector<Value>(tiledPackOp->getResults()), |
| llvm::to_vector(ArrayRef<Operation *>{sourceSlice, outSlice})}; |
| } |
| |
| LogicalResult |
| getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber, |
| ArrayRef<OpFoldResult> offsets, |
| ArrayRef<OpFoldResult> sizes, |
| SmallVector<OpFoldResult> &resultOffsets, |
| SmallVector<OpFoldResult> &resultSizes) const { |
| // The iteration domain is over outer dimensions of packed layout. In this |
| // context, the outer dimensions of `resultOffsets` are `offsets`. The |
| // inner dimensions of `resultOffsets` are zeros because tiling is not |
| // applied to them. |
| auto packOp = cast<PackOp>(op); |
| int64_t inputRank = packOp.getSourceRank(); |
| int64_t outputRank = packOp.getDestRank(); |
| auto zeroAttr = b.getI64IntegerAttr(0); |
| resultOffsets.assign(offsets.begin(), offsets.end()); |
| resultOffsets.append(outputRank - inputRank, zeroAttr); |
| |
| ReifiedRankedShapedTypeDims outputShape; |
| (void)reifyResultShapes(b, packOp, outputShape); |
| resultSizes.assign(sizes.begin(), sizes.end()); |
| for (auto dataTileDim : llvm::seq<unsigned>(inputRank, outputRank)) |
| resultSizes.push_back(outputShape[0][dataTileDim]); |
| |
| return success(); |
| } |
| |
| FailureOr<TilingResult> |
| generateResultTileValue(Operation *op, OpBuilder &b, unsigned resultNumber, |
| ArrayRef<OpFoldResult> offsets, |
| ArrayRef<OpFoldResult> sizes) const { |
| auto packOp = cast<PackOp>(op); |
| int64_t numTiles = packOp.getInnerDimsPos().size(); |
| |
    // linalg.pack op is fusible (as a producer) only if full inner tiles are
    // iterated or inner dims are not tiled. Otherwise, it will generate a
    // sequence of non-trivial ops (for partial tiles).
| for (auto offset : offsets.take_back(numTiles)) |
| if (!isZeroInteger(offset)) |
| return failure(); |
| |
| for (auto iter : |
| llvm::zip_equal(packOp.getMixedTiles(), sizes.take_back(numTiles))) |
| if (!isEqualConstantIntOrValue(std::get<0>(iter), std::get<1>(iter))) |
| return failure(); |
| |
| FailureOr<TilingResult> tilingResult = getTiledImplementation( |
| op, b, offsets.drop_back(numTiles), sizes.drop_back(numTiles)); |
| if (failed(tilingResult)) |
| return failure(); |
| return tilingResult.value(); |
| } |
| |
  /// Method to return the position of the iteration domain tile computed by
  /// the tiled operation. In the current `linalg.pack` context, the
  /// `resultOffsets` and `resultSizes` only cover outer dimensions.
| LogicalResult getIterationDomainTileFromOperandTiles( |
| Operation *op, OpBuilder &b, ArrayRef<unsigned> operandNumbers, |
| ArrayRef<SmallVector<OpFoldResult>> allOffsets, |
| ArrayRef<SmallVector<OpFoldResult>> allSizes, |
| SmallVectorImpl<OpFoldResult> &resultOffsets, |
| SmallVectorImpl<OpFoldResult> &resultSizes) const { |
| if (operandNumbers.size() != 1 || operandNumbers[0] != 0) { |
| LLVM_DEBUG( |
| { llvm::dbgs() << "unsupported operands for consumer fusion"; }); |
| return failure(); |
| } |
| |
| ArrayRef<OpFoldResult> offsets(allOffsets[0]); |
| ArrayRef<OpFoldResult> sizes(allSizes[0]); |
| auto packOp = cast<PackOp>(op); |
| Location loc = packOp.getLoc(); |
| SmallVector<OpFoldResult> outerDimOffsets, outerDimSizes; |
| DenseMap<int64_t, OpFoldResult> dimAndTileMapping = |
| packOp.getDimAndTileMapping(); |
| SmallVector<int64_t> outerShapeWithoutTranspose( |
| packOp.getDestType().getShape().take_front(packOp.getSourceRank())); |
| if (!packOp.getOuterDimsPerm().empty()) { |
| applyPermutationToVector( |
| outerShapeWithoutTranspose, |
| invertPermutationVector(packOp.getOuterDimsPerm())); |
| } |
| for (auto dim : llvm::seq<int64_t>(packOp.getSourceRank())) { |
| if (dimAndTileMapping.count(dim)) { |
| FailureOr<int64_t> cstTileSize = |
| ValueBoundsConstraintSet::computeConstantBound( |
| presburger::BoundType::UB, sizes[dim], |
| /*stopCondition=*/nullptr, /*closedUB=*/true); |
| std::optional<int64_t> cstInnerSize = |
| getConstantIntValue(dimAndTileMapping[dim]); |
| |
        // If a dimension is not tiled, it is always valid to fuse the pack op,
        // even if the op has padding semantics, because it always generates a
        // full slice along that dimension. The tile sizes are for the unpacked
        // domain, i.e., `srcDimSize`, so `tileSize < srcDimSize` means that
        // the dimension is tiled.
        // TODO: The dimension could be untiled if `srcDimSize` is dynamic, but
        // it is hard to determine whether a dimension is tiled in that case.
| int64_t srcDimSize = packOp.getSourceType().getDimSize(dim); |
| int64_t destDimSize = outerShapeWithoutTranspose[dim]; |
| bool isTiled = failed(cstTileSize) || |
| ShapedType::isDynamic(srcDimSize) || |
| cstTileSize.value() < srcDimSize; |
| if (!isTiled) { |
| outerDimOffsets.push_back(offsets[dim]); |
| if (ShapedType::isStatic(destDimSize)) { |
| outerDimSizes.push_back(b.getIndexAttr(destDimSize)); |
| } else { |
| outerDimSizes.push_back( |
| b.createOrFold<tensor::DimOp>(loc, packOp.getDest(), dim)); |
| } |
| continue; |
| } |
| |
        // Currently, fusing `packOp` as a consumer expects a perfect tiling
        // scenario because, even without padding semantics, the `packOp` may
        // yield incomplete tiles. E.g. tensor<30xf32> -> tensor<5x6xf32>,
        // where the `tileSize` from the operand of `packOp` is 5, which is
        // not a multiple of the `innerTile` (=6) of `packOp`. As a result:
        // 1. the first slice is extracted from (0) to (4) and inserted into
        // (0,0)~(0,4) in the first row.
        // 2. the second slice is extracted from (5) to (9) and SHOULD BE
        // respectively inserted into two rows with different lengths: first
        // row (0,5) and second row (1,0)~(1,3). It is hard to coordinate
        // them, so the constraint below bypasses such cases temporarily. In
        // other words, we can only support tiling with a consumer if the tile
        // size for the producer is a multiple of the inner tile size for the
        // packed dimensions at this moment.
| if ((failed(cstTileSize) || !cstInnerSize || |
| *cstTileSize % *cstInnerSize != 0)) |
| return failure(); |
| |
| using AV = affine::AffineValueExpr; |
| affine::AffineBuilder ab(b, loc); |
| AffineExpr dim0, sym; |
| bindDims(b.getContext(), dim0); |
| bindSymbols(b.getContext(), sym); |
| auto avOffset = AV(dim0).bind(offsets[dim]); |
| auto avSize = AV(dim0).bind(sizes[dim]); |
| auto avTileSize = AV(sym).bind(dimAndTileMapping[dim]); |
| outerDimOffsets.push_back(ab.floor(avOffset, avTileSize)); |
| outerDimSizes.push_back(ab.ceil(avSize, avTileSize)); |
| } else { |
| outerDimOffsets.push_back(offsets[dim]); |
| outerDimSizes.push_back(sizes[dim]); |
| } |
| } |
| applyPermToRange(outerDimOffsets, outerDimSizes, packOp.getOuterDimsPerm()); |
| resultOffsets = outerDimOffsets; |
| resultSizes = outerDimSizes; |
| return success(); |
| } |
| |
  /// Method to return the tiled implementation of linalg.pack as a consumer.
| FailureOr<TilingResult> getTiledImplementationFromOperandTiles( |
| Operation *op, OpBuilder &b, ArrayRef<unsigned> operandNumbers, |
| ArrayRef<SmallVector<OpFoldResult>> allOffsets, |
| ArrayRef<SmallVector<OpFoldResult>> allSizes) const { |
| if (operandNumbers.size() != 1 || operandNumbers[0] != 0) { |
      LLVM_DEBUG(
          { llvm::dbgs() << "unhandled operands for consumer fusion"; });
| return failure(); |
| } |
| |
| ArrayRef<OpFoldResult> offsets(allOffsets[0]); |
| ArrayRef<OpFoldResult> sizes(allSizes[0]); |
| |
| auto packOp = cast<PackOp>(op); |
| Location loc = packOp.getLoc(); |
| |
| int64_t inputRank = packOp.getSourceRank(); |
| auto oneAttr = b.getI64IntegerAttr(1); |
| SmallVector<OpFoldResult> strides(inputRank, oneAttr); |
| |
| SmallVector<Value> tiledOperands; |
| auto sourceSlice = tensor::ExtractSliceOp::create( |
| b, loc, packOp.getSource(), offsets, sizes, strides); |
| tiledOperands.push_back(sourceSlice); |
| |
| SmallVector<OpFoldResult> outerDimOffsets, outerDimSizes; |
| if (failed(getIterationDomainTileFromOperandTiles( |
| op, b, operandNumbers, allOffsets, allSizes, outerDimOffsets, |
| outerDimSizes))) |
| return failure(); |
| |
| SmallVector<OpFoldResult> outputOffsets, outputSizes; |
| if (failed(getResultTilePosition(op, b, 0, outerDimOffsets, outerDimSizes, |
| outputOffsets, outputSizes))) |
| return failure(); |
| |
| strides.append(packOp.getDestRank() - inputRank, oneAttr); |
| auto outSlice = tensor::ExtractSliceOp::create( |
| b, loc, packOp.getDest(), outputOffsets, outputSizes, strides); |
| tiledOperands.push_back(outSlice); |
| |
| if (auto val = packOp.getPaddingValue()) |
| tiledOperands.push_back(val); |
| for (auto tile : packOp.getInnerTiles()) |
| tiledOperands.push_back(tile); |
| |
| Operation *tiledPackOp = PackOp::create( |
| b, loc, TypeRange{outSlice.getType()}, tiledOperands, op->getAttrs()); |
| |
| return TilingResult{ |
| {tiledPackOp}, |
| SmallVector<Value>(tiledPackOp->getResults()), |
| llvm::to_vector(ArrayRef<Operation *>{sourceSlice, outSlice})}; |
| } |
| }; |
| |
| struct UnpackTileDimInfo { |
| bool isAlignedToInnerTileSize; |
| OpFoldResult sourceOffset; |
| OpFoldResult sourceSize; |
| OpFoldResult resultOffset; |
| OpFoldResult destExpandedSize; |
| }; |
| |
/// Returns the information needed for tiling an unpack op along `tileDim`,
/// given the `tileOffset` and `tileSize`. For more details, see the comment on
/// `getTiledImplementation`.
| static UnpackTileDimInfo getUnpackTileDimInfo(OpBuilder &b, UnPackOp unpackOp, |
| int64_t tileDim, |
| OpFoldResult tileOffset, |
| OpFoldResult tileSize) { |
| UnpackTileDimInfo info; |
| Attribute zeroAttr = b.getIndexAttr(0); |
| Attribute oneAttr = b.getIndexAttr(1); |
| DenseMap<int64_t, OpFoldResult> dimAndTileMapping = |
| unpackOp.getDimAndTileMapping(); |
  // The dimension is not one of the packed data dimensions.
| if (!dimAndTileMapping.count(tileDim)) { |
| info.isAlignedToInnerTileSize = true; |
| info.sourceOffset = tileOffset; |
| info.sourceSize = tileSize; |
| info.resultOffset = zeroAttr; |
| info.destExpandedSize = tileSize; |
| return info; |
| } |
| |
| Location loc = unpackOp.getLoc(); |
| using AV = affine::AffineValueExpr; |
| affine::AffineBuilder ab(b, loc); |
| AffineExpr dim0, dim1, sym0; |
| bindDims(b.getContext(), dim0, dim1); |
| bindSymbols(b.getContext(), sym0); |
| |
| OpFoldResult innerTileSize = dimAndTileMapping[tileDim]; |
| |
| info.isAlignedToInnerTileSize = false; |
| FailureOr<int64_t> cstSize = ValueBoundsConstraintSet::computeConstantBound( |
| presburger::BoundType::UB, tileSize, |
| /*stopCondition=*/nullptr, /*closedUB=*/true); |
| std::optional<int64_t> cstInnerSize = getConstantIntValue(innerTileSize); |
  if (succeeded(cstSize) && cstInnerSize) {
| if (*cstSize % *cstInnerSize == 0) |
| info.isAlignedToInnerTileSize = true; |
| |
    // If the tiling size equals the inner tiling size, the outer dims are
    // always 1.
| if (*cstInnerSize == *cstSize) { |
| auto lhs = AV(dim0).bind(tileOffset); |
| auto rhs = AV(dim1).bind(innerTileSize); |
| info.sourceOffset = ab.floor(lhs, rhs); |
| info.sourceSize = oneAttr; |
| info.resultOffset = zeroAttr; |
| info.destExpandedSize = tileSize; |
| return info; |
| } |
| } |
| |
| if (info.isAlignedToInnerTileSize) { |
| info.sourceOffset = |
| ab.floor(AV(dim0).bind(tileOffset), AV(dim1).bind(innerTileSize)); |
| info.resultOffset = zeroAttr; |
| info.destExpandedSize = tileSize; |
| |
    // The ceilDiv is needed here because there could be an incomplete tile
    // even in perfect tiling cases. E.g.,
    //   %0 = unpack tensor<33x2xf32> into tensor<64xf32>
    // If the tiling size is 32, there will be 3 tiles. Two of them have
    // size=32; one of them has size=2. The size is represented using an
    // affine_min op; we need ceilDiv.
| info.sourceSize = |
| ab.ceil(AV(dim0).bind(tileSize), AV(dim1).bind(innerTileSize)); |
| return info; |
| } |
| |
| affine::DivModValue firstCoord = affine::getDivMod( |
| b, loc, getValueOrCreateConstantIndexOp(b, loc, tileOffset), |
| getValueOrCreateConstantIndexOp(b, loc, innerTileSize)); |
| OpFoldResult tileExclusiveBound = |
| ab.add(AV(dim0).bind(tileOffset), AV(dim1).bind(tileSize)); |
| affine::DivModValue lastCoord = affine::getDivMod( |
| b, loc, |
| getValueOrCreateConstantIndexOp( |
| b, loc, |
| ab.sub(AV(dim0).bind(tileExclusiveBound), AV(dim1).bind(oneAttr))), |
| getValueOrCreateConstantIndexOp(b, loc, innerTileSize)); |
| |
| OpFoldResult lengthMinusOne = ab.sub(AV(dim0).bind(lastCoord.quotient), |
| AV(dim1).bind(firstCoord.quotient)); |
| info.sourceSize = |
| ab.add(AV(dim0).bind(lengthMinusOne), AV(dim1).bind(oneAttr)); |
| info.sourceOffset = firstCoord.quotient; |
| info.resultOffset = firstCoord.remainder; |
  // Do not create affine ops for the expanded size because the affine
  // expressions get too complicated, which would trigger issues in affine op
  // simplification.
| info.destExpandedSize = b.createOrFold<arith::MulIOp>( |
| loc, getValueOrCreateConstantIndexOp(b, loc, info.sourceSize), |
| getValueOrCreateConstantIndexOp(b, loc, innerTileSize)); |
| return info; |
| } |
| |
| struct UnPackOpTiling |
| : public TilingInterface::ExternalModel<UnPackOpTiling, linalg::UnPackOp> { |
| |
| SmallVector<utils::IteratorType> getLoopIteratorTypes(Operation *op) const { |
| auto unpackOp = cast<UnPackOp>(op); |
| SmallVector<utils::IteratorType> iteratorTypes( |
| unpackOp.getDestRank(), utils::IteratorType::parallel); |
| return iteratorTypes; |
| } |
| |
| SmallVector<Range> getIterationDomain(Operation *op, OpBuilder &b) const { |
| return getPackUnPackIterationDomain<UnPackOp>(cast<UnPackOp>(op), b); |
| } |
| |
  /// There are two cases when tiling unpack ops. If the tiling size is aligned
  /// to the inner tile size, the corresponding tiles of the source are all
  /// complete. Otherwise, there are incomplete tiles. We will need to expand
  /// the slice of the source to get complete tiles. The tiled unpack op
  /// unpacks more data from the source, so we need an extract_slice op to
  /// shift and truncate the output.
  /// Take Nn_to_N as an example. Say that N=32, n=8, and tiling_size=15. The
  /// coordinates of the second tile (i.e., result[15..31]) are
  /// [(1, 7), (2, 0), (2, 1) ... (3, 6), (3, 7)]. The first row and the last
  /// row are incomplete tiles. To represent the unpack op, we have to complete
  /// the rows, i.e., the input coordinates would start with (1, 0) and end
  /// with (3, 7). In this context, the tiled unpack produces (3 * n) elements
  /// because there are 3 rows in total. Followed by a tensor.extract_slice op,
  /// we can get the actual result.
| FailureOr<TilingResult> |
| getTiledImplementation(Operation *op, OpBuilder &b, |
| ArrayRef<OpFoldResult> offsets, |
| ArrayRef<OpFoldResult> sizes) const { |
| auto unpackOp = cast<UnPackOp>(op); |
| int64_t srcRank = unpackOp.getSourceRank(); |
| int64_t destRank = unpackOp.getDestRank(); |
| int64_t numInnerTiles = srcRank - destRank; |
| Location loc = unpackOp.getLoc(); |
| |
    // The perfect tiling case indicates that the tiling sizes are multiples of
    // the inner_tile_size. In this context, no extra data is needed when
    // representing the tiled unpack op.
| bool isPerfectTilingCase = true; |
| Attribute oneAttr = b.getIndexAttr(1); |
| SmallVector<OpFoldResult> sliceSrcStrides(destRank, oneAttr); |
| SmallVector<OpFoldResult> sliceSrcIndices, sliceSrcSizes; |
| SmallVector<OpFoldResult> destExpandedSizes, resultOffsetsFromDest; |
| for (auto dim : llvm::seq<int64_t>(0, destRank)) { |
| UnpackTileDimInfo info = |
| getUnpackTileDimInfo(b, unpackOp, dim, offsets[dim], sizes[dim]); |
| if (!info.isAlignedToInnerTileSize) |
| isPerfectTilingCase = false; |
| sliceSrcIndices.push_back(info.sourceOffset); |
| sliceSrcSizes.push_back(info.sourceSize); |
| destExpandedSizes.push_back(info.destExpandedSize); |
| resultOffsetsFromDest.push_back(info.resultOffset); |
| } |
| |
| // The tiling is applied on destination dimensions. We have to apply the |
| // interchange on source dimensions if outer_dims_perm is set. |
| applyPermToRange(sliceSrcIndices, sliceSrcSizes, |
| unpackOp.getOuterDimsPerm()); |
| Attribute zeroAttr = b.getIndexAttr(0); |
| sliceSrcIndices.append(numInnerTiles, zeroAttr); |
| sliceSrcSizes.append(unpackOp.getMixedTiles()); |
| sliceSrcStrides.append(numInnerTiles, oneAttr); |
| SmallVector<Operation *> generatedSlices; |
| tensor::ExtractSliceOp sliceSource = tensor::ExtractSliceOp::create( |
| b, loc, unpackOp.getSource(), sliceSrcIndices, sliceSrcSizes, |
| sliceSrcStrides); |
| generatedSlices.push_back(sliceSource); |
| |
| SmallVector<OpFoldResult> destStrides(destRank, oneAttr); |
| Value sliceDest; |
| if (isPerfectTilingCase) { |
| auto destSliceOp = tensor::ExtractSliceOp::create( |
| b, loc, unpackOp.getDest(), offsets, sizes, destStrides); |
| sliceDest = destSliceOp; |
| generatedSlices.push_back(destSliceOp); |
| } else { |
| sliceDest = tensor::EmptyOp::create( |
| b, loc, destExpandedSizes, unpackOp.getDestType().getElementType()); |
| } |
| |
| SmallVector<Value> tiledOperands = {sliceSource.getResult(), sliceDest}; |
| for (auto tile : unpackOp.getInnerTiles()) |
| tiledOperands.push_back(tile); |
| |
| Operation *tiledUnpackOp = UnPackOp::create( |
| b, loc, TypeRange{sliceDest.getType()}, tiledOperands, op->getAttrs()); |
| |
| if (isPerfectTilingCase) |
| return TilingResult{{tiledUnpackOp}, |
| SmallVector<Value>(tiledUnpackOp->getResults()), |
| generatedSlices}; |
| |
| auto extractSlice = tensor::ExtractSliceOp::create( |
| b, loc, tiledUnpackOp->getResult(0), resultOffsetsFromDest, sizes, |
| destStrides); |
| return TilingResult{ |
| {tiledUnpackOp}, {extractSlice.getResult()}, generatedSlices}; |
| } |
| |
| LogicalResult |
| getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber, |
| ArrayRef<OpFoldResult> offsets, |
| ArrayRef<OpFoldResult> sizes, |
| SmallVector<OpFoldResult> &resultOffsets, |
| SmallVector<OpFoldResult> &resultSizes) const { |
| resultOffsets = llvm::to_vector(offsets); |
| resultSizes = llvm::to_vector(sizes); |
| return success(); |
| } |
| |
| FailureOr<TilingResult> |
| generateResultTileValue(Operation *op, OpBuilder &b, unsigned resultNumber, |
| ArrayRef<OpFoldResult> offsets, |
| ArrayRef<OpFoldResult> sizes) const { |
| FailureOr<TilingResult> tilingResult = |
| getTiledImplementation(op, b, offsets, sizes); |
| if (failed(tilingResult)) |
| return failure(); |
| return tilingResult.value(); |
| } |
| |
| /// Method to return the position of iteration domain tile computed by the |
| /// tiled operation. |
| LogicalResult getIterationDomainTileFromOperandTiles( |
| Operation *op, OpBuilder &b, ArrayRef<unsigned> operandNumbers, |
| ArrayRef<SmallVector<OpFoldResult>> allOffsets, |
| ArrayRef<SmallVector<OpFoldResult>> allSizes, |
| SmallVectorImpl<OpFoldResult> &resultOffsets, |
| SmallVectorImpl<OpFoldResult> &resultSizes) const { |
| if (operandNumbers.size() != 1) { |
| LLVM_DEBUG({ llvm::dbgs() << "unable to handle multiple operands"; }); |
| return failure(); |
| } |
| auto unPackOp = cast<UnPackOp>(op); |
| unsigned operandNumber = operandNumbers[0]; |
| ArrayRef<OpFoldResult> offsets(allOffsets[0]); |
| ArrayRef<OpFoldResult> sizes(allSizes[0]); |
| |
| // If the operand tile is the dest, then no adjustment is needed. |
| if (operandNumber == unPackOp.getDestMutable().getOperandNumber()) { |
| resultOffsets = llvm::to_vector(offsets); |
| resultSizes = llvm::to_vector(sizes); |
| return success(); |
| } |
| Location loc = unPackOp.getLoc(); |
| |
| int64_t numTiles = unPackOp.getInnerDimsPos().size(); |
| auto destOffsets = offsets.drop_back(numTiles); |
| auto destSizes = sizes.drop_back(numTiles); |
| // The tiling is applied on interchanged dimensions. We have to undo the |
| // interchange to map sizes and offsets to the original input. |
| int64_t outputRank = unPackOp.getDestRank(); |
| ReifiedRankedShapedTypeDims reifiedReturnShapes; |
| if (failed(reifyResultShapes(b, unPackOp, reifiedReturnShapes))) |
| return failure(); |
| SmallVector<OpFoldResult> outputMixedSizes = reifiedReturnShapes.front(); |
| SmallVector<OpFoldResult> origOffsets(destOffsets); |
| SmallVector<OpFoldResult> origSizes(destSizes); |
| applyPermToRange(origOffsets, origSizes, |
| invertPermutationVector(unPackOp.getOuterDimsPerm())); |
| |
| DenseMap<int64_t, OpFoldResult> dimAndTileMapping = |
| unPackOp.getDimAndTileMapping(); |
| |
| for (auto dim : llvm::seq<int64_t>(0, outputRank)) { |
| using AV = affine::AffineValueExpr; |
| affine::AffineBuilder ab(b, loc); |
| AffineExpr dim0, dim1, sym0; |
| bindDims(b.getContext(), dim0, dim1); |
| bindSymbols(b.getContext(), sym0); |
| if (dimAndTileMapping.count(dim)) { |
| // If the data dimension is tiled, the i-th index is the product of |
| // offset_i and tile_i, and the i-th size is the product of sizes_i and |
| // tile_i. The sizes must be clamped to the sizes of the unpack result. |
| auto avOffset = AV(dim0).bind(origOffsets[dim]); |
| auto avSize = AV(dim0).bind(origSizes[dim]); |
| auto avTileSize = AV(sym0).bind(dimAndTileMapping[dim]); |
| auto avResultSize = AV(dim0).bind(outputMixedSizes[dim]); |
| resultOffsets.push_back(ab.mul(avOffset, avTileSize)); |
| auto avResultOffset = AV(dim1).bind(resultOffsets.back()); |
| resultSizes.push_back(ab.min({ab.mul(avSize, avTileSize), |
| ab.sub(avResultSize, avResultOffset)})); |
| } else { |
| resultOffsets.push_back(origOffsets[dim]); |
| resultSizes.push_back(origSizes[dim]); |
| } |
| } |
| return success(); |
| } |
| |
  /// Method to return the tiled implementation of linalg.unpack as a consumer.
| FailureOr<TilingResult> getTiledImplementationFromOperandTiles( |
| Operation *op, OpBuilder &b, ArrayRef<unsigned> operandNumbers, |
| ArrayRef<SmallVector<OpFoldResult>> allOffsets, |
| ArrayRef<SmallVector<OpFoldResult>> allSizes) const { |
| if (operandNumbers.size() != 1 || operandNumbers[0] != 0) { |
| LLVM_DEBUG({ llvm::dbgs() << "unhandled operands for consumer fusion"; }); |
| return failure(); |
| } |
| auto unPackOp = cast<UnPackOp>(op); |
| ArrayRef<OpFoldResult> offsets(allOffsets[0]); |
| ArrayRef<OpFoldResult> sizes(allSizes[0]); |
| |
    // linalg.unpack op is fusible (as a consumer) only if inner dims are not
    // tiled.
| int64_t numTiles = unPackOp.getInnerDimsPos().size(); |
| for (auto iter : |
| llvm::zip_equal(unPackOp.getMixedTiles(), sizes.take_back(numTiles))) { |
| if (!isEqualConstantIntOrValue(std::get<0>(iter), std::get<1>(iter))) |
| return failure(); |
| } |
| |
| Location loc = unPackOp.getLoc(); |
| |
| // Fetch offset/size for creating the slice of the dest operand of |
| // unpack op. |
| SmallVector<OpFoldResult> outputOffsets, outputSizes; |
| if (failed(getIterationDomainTileFromOperandTiles( |
| op, b, operandNumbers, allOffsets, allSizes, outputOffsets, |
| outputSizes))) |
| return failure(); |
| |
| auto oneAttr = b.getI64IntegerAttr(1); |
| int64_t outputRank = unPackOp.getDestRank(); |
| SmallVector<OpFoldResult> strides(outputRank, oneAttr); |
| |
| SmallVector<Value> tiledOperands; |
| // Create slice of the dest operand. |
| auto extractDestSlice = tensor::ExtractSliceOp::create( |
| b, loc, unPackOp.getDest(), outputOffsets, outputSizes, strides); |
| tiledOperands.push_back(extractDestSlice); |
| |
| strides.append(unPackOp.getSourceRank() - outputRank, oneAttr); |
| // Create slice of the source operand. |
| auto extractSourceSlice = tensor::ExtractSliceOp::create( |
| b, loc, unPackOp.getSource(), offsets, sizes, strides); |
| tiledOperands.insert(tiledOperands.begin(), extractSourceSlice); |
| for (auto tile : unPackOp.getInnerTiles()) |
| tiledOperands.push_back(tile); |
| |
| // Create tiled unpack op. |
| Operation *tiledUnPackOp = |
| UnPackOp::create(b, loc, TypeRange{extractDestSlice.getType()}, |
| tiledOperands, op->getAttrs()); |
| |
| return TilingResult{{tiledUnPackOp}, |
| SmallVector<Value>(tiledUnPackOp->getResults()), |
| llvm::to_vector(ArrayRef<Operation *>{ |
| extractSourceSlice, extractDestSlice})}; |
| } |
| }; |
| |
| } // namespace |
| |
| template <typename OpType> |
| static void registerOne(MLIRContext *ctx) { |
| OpType::template attachInterface<LinalgOpTilingInterface<OpType>>(*ctx); |
| OpType::template attachInterface<LinalgOpPartialReductionInterface<OpType>>( |
| *ctx); |
| } |
| |
| /// Variadic helper function. |
| template <typename... OpTypes> |
| static void registerAll(MLIRContext *ctx) { |
| (registerOne<OpTypes>(ctx), ...); |
| } |
| |
| #define GET_OP_LIST |
| |
| void mlir::linalg::registerTilingInterfaceExternalModels( |
| DialectRegistry ®istry) { |
| registry.addExtension(+[](MLIRContext *ctx, linalg::LinalgDialect *dialect) { |
| registerOne<linalg::GenericOp>(ctx); |
| linalg::PackOp::attachInterface<PackOpTiling>(*ctx); |
| linalg::UnPackOp::attachInterface<UnPackOpTiling>(*ctx); |
| registerAll< |
| #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc" |
| >(ctx); |
| }); |
| } |
| |
| void mlir::linalg::registerTilingInterfaceExternalModelsForPackUnPackOps( |
| DialectRegistry ®istry) { |
| registry.addExtension(+[](MLIRContext *ctx, LinalgDialect *dialect) { |
| linalg::PackOp::attachInterface<PackOpTiling>(*ctx); |
| linalg::UnPackOp::attachInterface<UnPackOpTiling>(*ctx); |
| }); |
| } |