lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp - llvm-project/mlir - Git at Google

 //===- TilingInterfaceImpl.cpp - Implementation of TilingInterface -------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 #include "mlir/Dialect/Linalg/Transforms/TilingInterfaceImpl.h"

 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/Interfaces/TilingInterface.h"

 using namespace mlir;
 using namespace mlir::linalg;

 //===----------------------------------------------------------------------===//
 // Utility methods for implementation of Tiling Interface for Linalg ops
 //===----------------------------------------------------------------------===//

 /// Return the SSA values that represent the data point accessed using a given
 /// `indexingMap` for a given point in the iteration space represented by `ivs`.
 static SmallVector<Value> getIndicesForAccess(OpBuilder &b, Location loc,
                                               AffineMap indexingMap,
                                               ValueRange ivs) {
   SmallVector<Value> indices;
   indices.reserve(indexingMap.getNumResults());
   for (auto result : indexingMap.getResults()) {
     AffineMap m = AffineMap::get(indexingMap.getNumDims(),
                                  indexingMap.getNumSymbols(), result);
     Value v = b.create<AffineApplyOp>(loc, m, ivs);
     indices.push_back(v);
   }
   return indices;
 }

 /// Method to inline the payload of a `linalgOp` given the iteration space
 /// point and values for the arguments of the payload.
 static LogicalResult inlinePayload(OpBuilder &b, LinalgOp linalgOp,
                                    ValueRange ivs, ValueRange argValues) {
   Block *body = linalgOp.getBlock();
   BlockAndValueMapping map;
   map.map(body->getArguments(), argValues);
   for (auto &op : body->without_terminator()) {
     if (auto indexOp = dyn_cast<IndexOp>(&op)) {
       map.map(indexOp.getResult(), ivs[indexOp.getDim()]);
       continue;
     }
     b.clone(op, map);
   }

   Operation *terminator = body->getTerminator();
   Location loc = terminator->getLoc();
   for (const auto &operand : llvm::enumerate(terminator->getOperands())) {
     Value toStore = map.lookupOrDefault(operand.value());
     OpOperand *storeInto = linalgOp.getDpsInitOperand(operand.index());
     auto indices = getIndicesForAccess(
         b, loc, linalgOp.getMatchingIndexingMap(storeInto), ivs);
     b.create<memref::StoreOp>(
         loc, toStore, linalgOp.getDpsInitOperand(operand.index())->get(),
         indices);
   }
   return success();
 }

 //===----------------------------------------------------------------------===//
 // External Model for implementing `TilingInterface` for `LinalgOp`s.
 //===----------------------------------------------------------------------===//

 namespace {
 /// External model implementation of TilingInterface for LinalgOps. An external
 /// model implementation is used for now till the use of `TilingInterface` is
 /// on-par with the current Linalg tiling + fusion patterns. Once it is
 /// maybe possible to move this into the op-definition (though there are
 /// advantages to leaving it as an external model)
 template <typename LinalgOpTy>
 struct LinalgOpTilingInterface
     : public TilingInterface::ExternalModel<LinalgOpTilingInterface<LinalgOpTy>,
                                             LinalgOpTy> {
   /// Return the loop iterator type.
   SmallVector<utils::IteratorType> getLoopIteratorTypes(Operation *op) const {
     LinalgOpTy concreteOp = cast<LinalgOpTy>(op);
     return llvm::to_vector(llvm::map_range(
         concreteOp.getIteratorTypesArray(), [](StringRef iteratorType) {
           return utils::symbolizeIteratorType(iteratorType).value();
         }));
   }

   /// Return the iteration domain range.
   SmallVector<Range> getIterationDomain(Operation *op, OpBuilder &b) const {
     OpBuilder::InsertionGuard g(b);
     b.setInsertionPoint(op);
     Location loc = op->getLoc();
     LinalgOp linalgOp = cast<LinalgOp>(op);
     SmallVector<OpFoldResult> allShapesSizes =
         linalgOp.createFlatListOfOperandDims(b, loc);
     AffineMap map = linalgOp.getShapesToLoopsMap();

     return llvm::to_vector(
         llvm::map_range(map.getResults(), [&](AffineExpr loopExpr) {
           OpFoldResult ofr =
               makeComposedFoldedAffineApply(b, loc, loopExpr, allShapesSizes);
           return Range{b.getIndexAttr(0), ofr, b.getIndexAttr(1)};
         }));
   }

   // Instantiate the tiled implementation of the operation.
   SmallVector<Operation *>
   getTiledImplementation(Operation *op, OpBuilder &b,
                          ArrayRef<OpFoldResult> offsets,
                          ArrayRef<OpFoldResult> sizes) const {
     // Leave the `sizeBounds` value empty. That is only needed when the `sizes`
     // specified could lead to out of bounds accesses.
     Location loc = op->getLoc();
     LinalgOp linalgOp = cast<LinalgOp>(op);
     SmallVector<Value> valuesToTile = linalgOp->getOperands();
     SmallVector<Value, 4> tiledOperands = makeTiledShapes(
         b, loc, linalgOp, valuesToTile, offsets, sizes, {}, true);

     SmallVector<Type> resultTensorTypes =
         getTensorOutputTypes(linalgOp, tiledOperands);

     Operation *tiledOp =
         linalgOp.clone(b, loc, resultTensorTypes, tiledOperands);
     offsetIndices(b, cast<LinalgOp>(tiledOp), offsets);

     return {tiledOp};
   }

   // Return the details of the output tile generated by the tiled
   // implementation.
   LogicalResult
   getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber,
                         ArrayRef<OpFoldResult> offsets,
                         ArrayRef<OpFoldResult> sizes,
                         SmallVector<OpFoldResult> &resultOffsets,
                         SmallVector<OpFoldResult> &resultSizes) const {
     Location loc = op->getLoc();
     LinalgOp linalgOp = cast<LinalgOp>(op);

     AffineExpr d0;
     bindDims(b.getContext(), d0);
     SmallVector<OpFoldResult> subShapeSizes =
         llvm::to_vector(llvm::map_range(sizes, [&](OpFoldResult ofr) {
           return makeComposedFoldedAffineApply(b, loc, d0 - 1, ofr);
         }));

     OpOperand *outOperand = linalgOp.getDpsInitOperand(resultNumber);
     SliceParameters sliceParams = computeSliceParameters(
         b, loc, outOperand->get(), sizes,
         linalgOp.getMatchingIndexingMap(outOperand), offsets,
         /*ubs*/ {}, subShapeSizes, true);
     resultOffsets = sliceParams.offsets;
     resultSizes = sliceParams.sizes;
     return success();
   }

   FailureOr<Value> generateResultTileValue(Operation *op, OpBuilder &b,
                                            unsigned resultNumber,
                                            ArrayRef<OpFoldResult> offsets,
                                            ArrayRef<OpFoldResult> sizes) const {
     auto linalgOp = cast<LinalgOp>(op);

     // Check that the indexing map used for the output is a projected
     // permutation. This could be relaxed with a more general approach that can
     // map the offsets and sizes from the result to iteration space tiles
     // (filling in full extent for dimensions not used to access the result).
     AffineMap indexingMap =
         linalgOp.getIndexingMapMatchingResult(op->getResult(resultNumber));
     if (!indexingMap.isProjectedPermutation()) {
       return op->emitOpError(
           "unhandled tiled implementation generation when result is not "
           "accessed using a permuted projection");
     }

     auto numLoops = linalgOp.getNumLoops();
     auto tilingInterfaceOp = cast<TilingInterface>(op);
     SmallVector<OpFoldResult> iterationTileOffsets(numLoops),
         iterationTileSizes(numLoops);
     if (!indexingMap.isPermutation()) {
       SmallVector<Range> iterationDomain =
           tilingInterfaceOp.getIterationDomain(b);
       for (const auto &range : llvm::enumerate(iterationDomain)) {
         iterationTileOffsets[range.index()] = range.value().offset;
         iterationTileSizes[range.index()] = range.value().size;
       }
     }
     for (const auto &resultExpr : llvm::enumerate(indexingMap.getResults())) {
       unsigned dimPosition =
           resultExpr.value().cast<AffineDimExpr>().getPosition();
       iterationTileOffsets[dimPosition] = offsets[resultExpr.index()];
       iterationTileSizes[dimPosition] = sizes[resultExpr.index()];
     }

     SmallVector<Operation *> tiledOp = tilingInterfaceOp.getTiledImplementation(
         b, iterationTileOffsets, iterationTileSizes);
     if (tiledOp.size() != 1)
       return op->emitOpError("failed to generate tiled implementation");

     return tiledOp[0]->getResult(resultNumber);
   }

   LogicalResult generateScalarImplementation(Operation *op, OpBuilder &builder,
                                              Location loc,
                                              ValueRange ivs) const {
     auto linalgOp = cast<LinalgOp>(op);
     if (!linalgOp.hasBufferSemantics())
       return op->emitOpError("expected operation to have buffer semantics");

     SmallVector<Value> indexedValues;
     indexedValues.reserve(linalgOp->getNumOperands());
     Location linalgOpLoc = op->getLoc();
     /// Load the data corresponding to the block arguments that
     /// represent input operands.
     for (OpOperand &operand : linalgOp->getOpOperands()) {
       if (!linalgOp.payloadUsesValueFromOperand(&operand)) {
         indexedValues.push_back(nullptr);
         continue;
       }
       if (linalgOp.isScalar(&operand)) {
         indexedValues.push_back(operand.get());
         continue;
       }
       SmallVector<Value> indices = getIndicesForAccess(
           builder, linalgOpLoc, linalgOp.getMatchingIndexingMap(&operand), ivs);
       Value load =
           builder.create<memref::LoadOp>(linalgOpLoc, operand.get(), indices);
       indexedValues.push_back(load);
     }

     /// Inline the op payload and store the result.
     return inlinePayload(builder, linalgOp, ivs, indexedValues);
   }
 };

 //===----------------------------------------------------------------------===//
 // External Model for implementing `PartialReductionInterface` for `LinalgOp`s.
 //===----------------------------------------------------------------------===//

 /// External model implementation of PartialReductionInterface for LinalgOps.
 template <typename LinalgOpTy>
 struct LinalgOpPartialReductionInterface
     : public PartialReductionOpInterface::ExternalModel<
           LinalgOpPartialReductionInterface<LinalgOpTy>, LinalgOpTy> {
   FailureOr<Operation *> generateInitialTensorForPartialReduction(
       Operation *op, OpBuilder &b, Location loc, ArrayRef<OpFoldResult> sizes,
       ArrayRef<int> reductionDims) const {
     auto linalgOp = cast<LinalgOp>(op);
     OpBuilder::InsertionGuard guard(b);
     assert(reductionDims.size() == 1 &&
            "only support single reduction right now.");
     if (linalgOp.hasBufferSemantics())
       return op->emitOpError("expected operation to have tensor semantics");
     // Insert the new parallel dimension based on the index of the reduction
     // loop. This could be controlled by user for more flexibility.
     int64_t insertSplitDimension = reductionDims[0];

     SmallVector<Operation *, 4> combinerOps;
     if (!matchReduction(linalgOp.getRegionOutputArgs(), 0, combinerOps) ||
         combinerOps.size() != 1)
       return op->emitOpError("Failed to anaysis the reduction operation.");

     Operation *reductionOp = combinerOps[0];
     Optional<Attribute> identity = getNeutralElement(reductionOp);
     if (!identity.has_value())
       return op->emitOpError(
           "Failed to get an identity value for the reduction operation.");

     // Calculate the new shape, we insert the new dimension based on the index
     // of the reduction dimension.
     SmallVector<int64_t> newOutputShape;
     ArrayRef<int64_t> oldShape =
         linalgOp.getShape(linalgOp.getDpsInitOperand(0));
     SmallVector<Value> dynamicDims;
     for (int64_t idx : llvm::seq<int64_t>(0, oldShape.size() + 1)) {
       if (idx == insertSplitDimension) {
         dispatchIndexOpFoldResults(sizes[idx], dynamicDims, newOutputShape,
                                    ShapedType::kDynamicStrideOrOffset);
         continue;
       }
       int64_t oldIdx = idx < insertSplitDimension ? idx : idx - 1;
       int64_t dim = oldShape[oldIdx];
       newOutputShape.push_back(dim);
       if (ShapedType::isDynamic(dim))
         dynamicDims.push_back(b.createOrFold<tensor::DimOp>(
             loc, linalgOp.getDpsInitOperand(0)->get(), oldIdx));
     }
     Value emptyTensor = b.create<tensor::EmptyOp>(
         loc, newOutputShape, linalgOp.getRegionOutputArgs()[0].getType(),
         dynamicDims);
     Value constantOp = b.create<arith::ConstantOp>(loc, *identity);
     auto identityTensor =
         b.create<linalg::FillOp>(loc, constantOp, emptyTensor);
     return identityTensor.getOperation();
   }

   Operation *tileToPartialReduction(Operation *op, OpBuilder &b, Location loc,
                                     ValueRange init,
                                     ArrayRef<OpFoldResult> offsets,
                                     ArrayRef<OpFoldResult> sizes,
                                     ArrayRef<int> reductionDims) const {
     OpBuilder::InsertionGuard guard(b);
     auto linalgOp = cast<LinalgOp>(op);
     assert(reductionDims.size() == 1 &&
            "only support single reduction right now.");
     int64_t insertSplitDimension = reductionDims[0];

     AffineMap oldOutputMap =
         linalgOp.getMatchingIndexingMap(linalgOp.getDpsInitOperand(0));
     SmallVector<AffineExpr> outputExpr;
     for (auto &[idx, expr] : llvm::enumerate(oldOutputMap.getResults())) {
       if (static_cast<int64_t>(idx) == insertSplitDimension) {
         outputExpr.push_back(b.getAffineDimExpr(reductionDims[0]));
       }
       outputExpr.push_back(expr);
     }
     if (insertSplitDimension == oldOutputMap.getNumResults())
       outputExpr.push_back(b.getAffineDimExpr(reductionDims[0]));

     // Step 1: Extract a slice of the input operands.
     SmallVector<Value> valuesToTile = linalgOp.getDpsInputOperands();
     SmallVector<Value, 4> tiledOperands =
         makeTiledShapes(b, loc, op, valuesToTile, offsets, sizes, {}, true);

     // Step 2: Extract the accumulator operands
     SmallVector<OpFoldResult> strides(offsets.size(), b.getIndexAttr(1));
     SmallVector<OpFoldResult> outOffsets(offsets.size(), b.getIndexAttr(0));
     // TODO: use SubsetExtractOpInterface once it is available.
     Value out = b.create<tensor::ExtractSliceOp>(loc, init[0], outOffsets,
                                                  sizes, strides);

     // Step3. create a generic op where the reduction dimension is replaced by a
     // parallel dimension of the size of reduction.
     SmallVector<StringRef> newIteratorTypes = linalgOp.getIteratorTypesArray();
     newIteratorTypes[reductionDims[0]] = getParallelIteratorTypeName();
     SmallVector<AffineMap> newMaps = linalgOp.getIndexingMapsArray();
     newMaps.back() = AffineMap::get(newMaps.back().getNumDims(), 0, outputExpr,
                                     linalgOp.getContext());
     auto genericOp =
         b.create<GenericOp>(loc, TypeRange({out.getType()}), tiledOperands,
                             ValueRange({out}), newMaps, newIteratorTypes);
     BlockAndValueMapping mapping;
     op->getRegion(0).cloneInto(&genericOp.getRegion(),
                                genericOp.getRegion().begin(), mapping);
     return genericOp.getOperation();
   }

   Operation *mergeReductions(Operation *op, OpBuilder &b, Location loc,
                              ValueRange partialReduce,
                              ArrayRef<int> reductionDims) const {
     auto linalgOp = cast<LinalgOp>(op);
     assert(reductionDims.size() == 1 &&
            "only support single reduction right now.");
     int64_t dimToMerge = reductionDims[0];

     // Then create a new reduction that only reduce the newly added dimension
     // from the previous op.
     int64_t intermRank =
         partialReduce[0].getType().cast<ShapedType>().getRank();
     AffineMap inputMap = b.getMultiDimIdentityMap(intermRank);
     SmallVector<StringRef> reductionIteratorTypes;
     SmallVector<AffineExpr> exprs;
     for (int64_t i : llvm::seq<int64_t>(0, intermRank)) {
       if (dimToMerge == i) {
         reductionIteratorTypes.push_back(getReductionIteratorTypeName());
       } else {
         exprs.push_back(b.getAffineDimExpr(i));
         reductionIteratorTypes.push_back(getParallelIteratorTypeName());
       }
     }
     AffineMap outputMap =
         AffineMap::get(intermRank, 0, exprs, op->getContext());
     SmallVector<AffineMap> reductionMaps = {inputMap, outputMap};

     SmallVector<Operation *, 4> combinerOps;
     matchReduction(linalgOp.getRegionOutputArgs(), 0, combinerOps);
     Operation *reductionOp = combinerOps[0];

     auto reduction = b.create<GenericOp>(
         loc, op->getResultTypes(), ValueRange({partialReduce[0]}),
         SmallVector<Value>{linalgOp.getDpsInitOperands()}, reductionMaps,
         reductionIteratorTypes,
         [reductionOp](OpBuilder &b, Location loc, ValueRange inputs) {
           Operation *clonedReductionOp = b.clone(*reductionOp);
           clonedReductionOp->setOperand(0, inputs[0]);
           clonedReductionOp->setOperand(1, inputs[1]);
           b.create<linalg::YieldOp>(loc, clonedReductionOp->getResult(0));
         });
     return reduction.getOperation();
   }
 };

 } // namespace

 template <typename OpType>
 static void registerOne(MLIRContext *ctx) {
   OpType::template attachInterface<LinalgOpTilingInterface<OpType>>(*ctx);
   OpType::template attachInterface<LinalgOpPartialReductionInterface<OpType>>(
       *ctx);
 }

 /// Variadic helper function.
 template <typename... OpTypes>
 static void registerAll(MLIRContext *ctx) {
   (registerOne<OpTypes>(ctx), ...);
 }

 #define GET_OP_LIST

 void mlir::linalg::registerTilingInterfaceExternalModels(
     DialectRegistry &registry) {
   registry.addExtension(+[](MLIRContext *ctx, linalg::LinalgDialect *dialect) {
     registerOne<linalg::GenericOp>(ctx);
     registerAll<
 #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
         >(ctx);
   });
 }
	//===- TilingInterfaceImpl.cpp - Implementation of TilingInterface -------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "mlir/Dialect/Linalg/Transforms/TilingInterfaceImpl.h"

	#include "mlir/Analysis/SliceAnalysis.h"
	#include "mlir/Dialect/Affine/IR/AffineOps.h"
	#include "mlir/Dialect/Arith/IR/Arith.h"
	#include "mlir/Dialect/Arith/Utils/Utils.h"
	#include "mlir/Dialect/Linalg/IR/Linalg.h"
	#include "mlir/Dialect/Linalg/Utils/Utils.h"
	#include "mlir/Dialect/MemRef/IR/MemRef.h"
	#include "mlir/Dialect/Tensor/IR/Tensor.h"
	#include "mlir/Dialect/Utils/StaticValueUtils.h"
	#include "mlir/Interfaces/TilingInterface.h"

	using namespace mlir;
	using namespace mlir::linalg;

	//===----------------------------------------------------------------------===//
	// Utility methods for implementation of Tiling Interface for Linalg ops
	//===----------------------------------------------------------------------===//

	/// Return the SSA values that represent the data point accessed using a given
	/// `indexingMap` for a given point in the iteration space represented by `ivs`.
	static SmallVector<Value> getIndicesForAccess(OpBuilder &b, Location loc,
	AffineMap indexingMap,
	ValueRange ivs) {
	SmallVector<Value> indices;
	indices.reserve(indexingMap.getNumResults());
	for (auto result : indexingMap.getResults()) {
	AffineMap m = AffineMap::get(indexingMap.getNumDims(),
	indexingMap.getNumSymbols(), result);
	Value v = b.create<AffineApplyOp>(loc, m, ivs);
	indices.push_back(v);
	}
	return indices;
	}

	/// Method to inline the payload of a `linalgOp` given the iteration space
	/// point and values for the arguments of the payload.
	static LogicalResult inlinePayload(OpBuilder &b, LinalgOp linalgOp,
	ValueRange ivs, ValueRange argValues) {
	Block *body = linalgOp.getBlock();
	BlockAndValueMapping map;
	map.map(body->getArguments(), argValues);
	for (auto &op : body->without_terminator()) {
	if (auto indexOp = dyn_cast<IndexOp>(&op)) {
	map.map(indexOp.getResult(), ivs[indexOp.getDim()]);
	continue;
	}
	b.clone(op, map);
	}

	Operation *terminator = body->getTerminator();
	Location loc = terminator->getLoc();
	for (const auto &operand : llvm::enumerate(terminator->getOperands())) {
	Value toStore = map.lookupOrDefault(operand.value());
	OpOperand *storeInto = linalgOp.getDpsInitOperand(operand.index());
	auto indices = getIndicesForAccess(
	b, loc, linalgOp.getMatchingIndexingMap(storeInto), ivs);
	b.create<memref::StoreOp>(
	loc, toStore, linalgOp.getDpsInitOperand(operand.index())->get(),
	indices);
	}
	return success();
	}

	//===----------------------------------------------------------------------===//
	// External Model for implementing `TilingInterface` for `LinalgOp`s.
	//===----------------------------------------------------------------------===//

	namespace {
	/// External model implementation of TilingInterface for LinalgOps. An external
	/// model implementation is used for now till the use of `TilingInterface` is
	/// on-par with the current Linalg tiling + fusion patterns. Once it is
	/// maybe possible to move this into the op-definition (though there are
	/// advantages to leaving it as an external model)
	template <typename LinalgOpTy>
	struct LinalgOpTilingInterface
	: public TilingInterface::ExternalModel<LinalgOpTilingInterface<LinalgOpTy>,
	LinalgOpTy> {
	/// Return the loop iterator type.
	SmallVector<utils::IteratorType> getLoopIteratorTypes(Operation *op) const {
	LinalgOpTy concreteOp = cast<LinalgOpTy>(op);
	return llvm::to_vector(llvm::map_range(
	concreteOp.getIteratorTypesArray(), [](StringRef iteratorType) {
	return utils::symbolizeIteratorType(iteratorType).value();
	}));
	}

	/// Return the iteration domain range.
	SmallVector<Range> getIterationDomain(Operation *op, OpBuilder &b) const {
	OpBuilder::InsertionGuard g(b);
	b.setInsertionPoint(op);
	Location loc = op->getLoc();
	LinalgOp linalgOp = cast<LinalgOp>(op);
	SmallVector<OpFoldResult> allShapesSizes =
	linalgOp.createFlatListOfOperandDims(b, loc);
	AffineMap map = linalgOp.getShapesToLoopsMap();

	return llvm::to_vector(
	llvm::map_range(map.getResults(), [&](AffineExpr loopExpr) {
	OpFoldResult ofr =
	makeComposedFoldedAffineApply(b, loc, loopExpr, allShapesSizes);
	return Range{b.getIndexAttr(0), ofr, b.getIndexAttr(1)};
	}));
	}

	// Instantiate the tiled implementation of the operation.
	SmallVector<Operation *>
	getTiledImplementation(Operation *op, OpBuilder &b,
	ArrayRef<OpFoldResult> offsets,
	ArrayRef<OpFoldResult> sizes) const {
	// Leave the `sizeBounds` value empty. That is only needed when the `sizes`
	// specified could lead to out of bounds accesses.
	Location loc = op->getLoc();
	LinalgOp linalgOp = cast<LinalgOp>(op);
	SmallVector<Value> valuesToTile = linalgOp->getOperands();
	SmallVector<Value, 4> tiledOperands = makeTiledShapes(
	b, loc, linalgOp, valuesToTile, offsets, sizes, {}, true);

	SmallVector<Type> resultTensorTypes =
	getTensorOutputTypes(linalgOp, tiledOperands);

	Operation *tiledOp =
	linalgOp.clone(b, loc, resultTensorTypes, tiledOperands);
	offsetIndices(b, cast<LinalgOp>(tiledOp), offsets);

	return {tiledOp};
	}

	// Return the details of the output tile generated by the tiled
	// implementation.
	LogicalResult
	getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber,
	ArrayRef<OpFoldResult> offsets,
	ArrayRef<OpFoldResult> sizes,
	SmallVector<OpFoldResult> &resultOffsets,
	SmallVector<OpFoldResult> &resultSizes) const {
	Location loc = op->getLoc();
	LinalgOp linalgOp = cast<LinalgOp>(op);

	AffineExpr d0;
	bindDims(b.getContext(), d0);
	SmallVector<OpFoldResult> subShapeSizes =
	llvm::to_vector(llvm::map_range(sizes, [&](OpFoldResult ofr) {
	return makeComposedFoldedAffineApply(b, loc, d0 - 1, ofr);
	}));

	OpOperand *outOperand = linalgOp.getDpsInitOperand(resultNumber);
	SliceParameters sliceParams = computeSliceParameters(
	b, loc, outOperand->get(), sizes,
	linalgOp.getMatchingIndexingMap(outOperand), offsets,
	/ubs/ {}, subShapeSizes, true);
	resultOffsets = sliceParams.offsets;
	resultSizes = sliceParams.sizes;
	return success();
	}

	FailureOr<Value> generateResultTileValue(Operation *op, OpBuilder &b,
	unsigned resultNumber,
	ArrayRef<OpFoldResult> offsets,
	ArrayRef<OpFoldResult> sizes) const {
	auto linalgOp = cast<LinalgOp>(op);

	// Check that the indexing map used for the output is a projected
	// permutation. This could be relaxed with a more general approach that can
	// map the offsets and sizes from the result to iteration space tiles
	// (filling in full extent for dimensions not used to access the result).
	AffineMap indexingMap =
	linalgOp.getIndexingMapMatchingResult(op->getResult(resultNumber));
	if (!indexingMap.isProjectedPermutation()) {
	return op->emitOpError(
	"unhandled tiled implementation generation when result is not "
	"accessed using a permuted projection");
	}

	auto numLoops = linalgOp.getNumLoops();
	auto tilingInterfaceOp = cast<TilingInterface>(op);
	SmallVector<OpFoldResult> iterationTileOffsets(numLoops),
	iterationTileSizes(numLoops);
	if (!indexingMap.isPermutation()) {
	SmallVector<Range> iterationDomain =
	tilingInterfaceOp.getIterationDomain(b);
	for (const auto &range : llvm::enumerate(iterationDomain)) {
	iterationTileOffsets[range.index()] = range.value().offset;
	iterationTileSizes[range.index()] = range.value().size;
	}
	}
	for (const auto &resultExpr : llvm::enumerate(indexingMap.getResults())) {
	unsigned dimPosition =
	resultExpr.value().cast<AffineDimExpr>().getPosition();
	iterationTileOffsets[dimPosition] = offsets[resultExpr.index()];
	iterationTileSizes[dimPosition] = sizes[resultExpr.index()];
	}

	SmallVector<Operation *> tiledOp = tilingInterfaceOp.getTiledImplementation(
	b, iterationTileOffsets, iterationTileSizes);
	if (tiledOp.size() != 1)
	return op->emitOpError("failed to generate tiled implementation");

	return tiledOp[0]->getResult(resultNumber);
	}

	LogicalResult generateScalarImplementation(Operation *op, OpBuilder &builder,
	Location loc,
	ValueRange ivs) const {
	auto linalgOp = cast<LinalgOp>(op);
	if (!linalgOp.hasBufferSemantics())
	return op->emitOpError("expected operation to have buffer semantics");

	SmallVector<Value> indexedValues;
	indexedValues.reserve(linalgOp->getNumOperands());
	Location linalgOpLoc = op->getLoc();
	/// Load the data corresponding to the block arguments that
	/// represent input operands.
	for (OpOperand &operand : linalgOp->getOpOperands()) {
	if (!linalgOp.payloadUsesValueFromOperand(&operand)) {
	indexedValues.push_back(nullptr);
	continue;
	}
	if (linalgOp.isScalar(&operand)) {
	indexedValues.push_back(operand.get());
	continue;
	}
	SmallVector<Value> indices = getIndicesForAccess(
	builder, linalgOpLoc, linalgOp.getMatchingIndexingMap(&operand), ivs);
	Value load =
	builder.create<memref::LoadOp>(linalgOpLoc, operand.get(), indices);
	indexedValues.push_back(load);
	}

	/// Inline the op payload and store the result.
	return inlinePayload(builder, linalgOp, ivs, indexedValues);
	}
	};

	//===----------------------------------------------------------------------===//
	// External Model for implementing `PartialReductionInterface` for `LinalgOp`s.
	//===----------------------------------------------------------------------===//

	/// External model implementation of PartialReductionInterface for LinalgOps.
	template <typename LinalgOpTy>
	struct LinalgOpPartialReductionInterface
	: public PartialReductionOpInterface::ExternalModel<
	LinalgOpPartialReductionInterface<LinalgOpTy>, LinalgOpTy> {
	FailureOr<Operation *> generateInitialTensorForPartialReduction(
	Operation *op, OpBuilder &b, Location loc, ArrayRef<OpFoldResult> sizes,
	ArrayRef<int> reductionDims) const {
	auto linalgOp = cast<LinalgOp>(op);
	OpBuilder::InsertionGuard guard(b);
	assert(reductionDims.size() == 1 &&
	"only support single reduction right now.");
	if (linalgOp.hasBufferSemantics())
	return op->emitOpError("expected operation to have tensor semantics");
	// Insert the new parallel dimension based on the index of the reduction
	// loop. This could be controlled by user for more flexibility.
	int64_t insertSplitDimension = reductionDims[0];

	SmallVector<Operation *, 4> combinerOps;
	if (!matchReduction(linalgOp.getRegionOutputArgs(), 0, combinerOps) \|\|
	combinerOps.size() != 1)
	return op->emitOpError("Failed to anaysis the reduction operation.");

	Operation *reductionOp = combinerOps[0];
	Optional<Attribute> identity = getNeutralElement(reductionOp);
	if (!identity.has_value())
	return op->emitOpError(
	"Failed to get an identity value for the reduction operation.");

	// Calculate the new shape, we insert the new dimension based on the index
	// of the reduction dimension.
	SmallVector<int64_t> newOutputShape;
	ArrayRef<int64_t> oldShape =
	linalgOp.getShape(linalgOp.getDpsInitOperand(0));
	SmallVector<Value> dynamicDims;
	for (int64_t idx : llvm::seq<int64_t>(0, oldShape.size() + 1)) {
	if (idx == insertSplitDimension) {
	dispatchIndexOpFoldResults(sizes[idx], dynamicDims, newOutputShape,
	ShapedType::kDynamicStrideOrOffset);
	continue;
	}
	int64_t oldIdx = idx < insertSplitDimension ? idx : idx - 1;
	int64_t dim = oldShape[oldIdx];
	newOutputShape.push_back(dim);
	if (ShapedType::isDynamic(dim))
	dynamicDims.push_back(b.createOrFold<tensor::DimOp>(
	loc, linalgOp.getDpsInitOperand(0)->get(), oldIdx));
	}
	Value emptyTensor = b.create<tensor::EmptyOp>(
	loc, newOutputShape, linalgOp.getRegionOutputArgs()[0].getType(),
	dynamicDims);
	Value constantOp = b.create<arith::ConstantOp>(loc, *identity);
	auto identityTensor =
	b.create<linalg::FillOp>(loc, constantOp, emptyTensor);
	return identityTensor.getOperation();
	}

	Operation tileToPartialReduction(Operation op, OpBuilder &b, Location loc,
	ValueRange init,
	ArrayRef<OpFoldResult> offsets,
	ArrayRef<OpFoldResult> sizes,
	ArrayRef<int> reductionDims) const {
	OpBuilder::InsertionGuard guard(b);
	auto linalgOp = cast<LinalgOp>(op);
	assert(reductionDims.size() == 1 &&
	"only support single reduction right now.");
	int64_t insertSplitDimension = reductionDims[0];

	AffineMap oldOutputMap =
	linalgOp.getMatchingIndexingMap(linalgOp.getDpsInitOperand(0));
	SmallVector<AffineExpr> outputExpr;
	for (auto &[idx, expr] : llvm::enumerate(oldOutputMap.getResults())) {
	if (static_cast<int64_t>(idx) == insertSplitDimension) {
	outputExpr.push_back(b.getAffineDimExpr(reductionDims[0]));
	}
	outputExpr.push_back(expr);
	}
	if (insertSplitDimension == oldOutputMap.getNumResults())
	outputExpr.push_back(b.getAffineDimExpr(reductionDims[0]));

	// Step 1: Extract a slice of the input operands.
	SmallVector<Value> valuesToTile = linalgOp.getDpsInputOperands();
	SmallVector<Value, 4> tiledOperands =
	makeTiledShapes(b, loc, op, valuesToTile, offsets, sizes, {}, true);

	// Step 2: Extract the accumulator operands
	SmallVector<OpFoldResult> strides(offsets.size(), b.getIndexAttr(1));
	SmallVector<OpFoldResult> outOffsets(offsets.size(), b.getIndexAttr(0));
	// TODO: use SubsetExtractOpInterface once it is available.
	Value out = b.create<tensor::ExtractSliceOp>(loc, init[0], outOffsets,
	sizes, strides);

	// Step3. create a generic op where the reduction dimension is replaced by a
	// parallel dimension of the size of reduction.
	SmallVector<StringRef> newIteratorTypes = linalgOp.getIteratorTypesArray();
	newIteratorTypes[reductionDims[0]] = getParallelIteratorTypeName();
	SmallVector<AffineMap> newMaps = linalgOp.getIndexingMapsArray();
	newMaps.back() = AffineMap::get(newMaps.back().getNumDims(), 0, outputExpr,
	linalgOp.getContext());
	auto genericOp =
	b.create<GenericOp>(loc, TypeRange({out.getType()}), tiledOperands,
	ValueRange({out}), newMaps, newIteratorTypes);
	BlockAndValueMapping mapping;
	op->getRegion(0).cloneInto(&genericOp.getRegion(),
	genericOp.getRegion().begin(), mapping);
	return genericOp.getOperation();
	}

	Operation mergeReductions(Operation op, OpBuilder &b, Location loc,
	ValueRange partialReduce,
	ArrayRef<int> reductionDims) const {
	auto linalgOp = cast<LinalgOp>(op);
	assert(reductionDims.size() == 1 &&
	"only support single reduction right now.");
	int64_t dimToMerge = reductionDims[0];

	// Then create a new reduction that only reduce the newly added dimension
	// from the previous op.
	int64_t intermRank =
	partialReduce[0].getType().cast<ShapedType>().getRank();
	AffineMap inputMap = b.getMultiDimIdentityMap(intermRank);
	SmallVector<StringRef> reductionIteratorTypes;
	SmallVector<AffineExpr> exprs;
	for (int64_t i : llvm::seq<int64_t>(0, intermRank)) {
	if (dimToMerge == i) {
	reductionIteratorTypes.push_back(getReductionIteratorTypeName());
	} else {
	exprs.push_back(b.getAffineDimExpr(i));
	reductionIteratorTypes.push_back(getParallelIteratorTypeName());
	}
	}
	AffineMap outputMap =
	AffineMap::get(intermRank, 0, exprs, op->getContext());
	SmallVector<AffineMap> reductionMaps = {inputMap, outputMap};

	SmallVector<Operation *, 4> combinerOps;
	matchReduction(linalgOp.getRegionOutputArgs(), 0, combinerOps);
	Operation *reductionOp = combinerOps[0];

	auto reduction = b.create<GenericOp>(
	loc, op->getResultTypes(), ValueRange({partialReduce[0]}),
	SmallVector<Value>{linalgOp.getDpsInitOperands()}, reductionMaps,
	reductionIteratorTypes,
	[reductionOp](OpBuilder &b, Location loc, ValueRange inputs) {
	Operation clonedReductionOp = b.clone(reductionOp);
	clonedReductionOp->setOperand(0, inputs[0]);
	clonedReductionOp->setOperand(1, inputs[1]);
	b.create<linalg::YieldOp>(loc, clonedReductionOp->getResult(0));
	});
	return reduction.getOperation();
	}
	};

	} // namespace

	template <typename OpType>
	static void registerOne(MLIRContext *ctx) {
	OpType::template attachInterface<LinalgOpTilingInterface<OpType>>(*ctx);
	OpType::template attachInterface<LinalgOpPartialReductionInterface<OpType>>(
	*ctx);
	}

	/// Variadic helper function.
	template <typename... OpTypes>
	static void registerAll(MLIRContext *ctx) {
	(registerOne<OpTypes>(ctx), ...);
	}

	#define GET_OP_LIST

	void mlir::linalg::registerTilingInterfaceExternalModels(
	DialectRegistry &registry) {
	registry.addExtension(+[](MLIRContext ctx, linalg::LinalgDialect dialect) {
	registerOne<linalg::GenericOp>(ctx);
	registerAll<
	#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
	>(ctx);
	});
	}