//===- DecomposeGenericByUnfoldingPermutation.cpp -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include <map>
#include <utility>

using namespace mlir;
using namespace mlir::linalg;

namespace {

/// This pattern decomposes the input operand(s) of a linalg.generic that has
/// a `transpose`, a `broadcast`, or a mixture of the two folded into its
/// indexing maps, into explicit linalg.transpose and linalg.broadcast ops.
/// Having them folded into the linalg.generic is a good optimization, but
/// sometimes we may want to unwrap, i.e. `unfold`, them. This rewrite
/// pattern does that for each input operand, which is useful for instance
/// when trying to recognize named ops.
///
/// The transpose, broadcast, or mixture of both is expressed in the affine
/// map of the operand. Technically, such a map is a `projected permutation`.
///
/// Example:
///
/// ```mlir
///
/// #projection = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d1)>
/// #identity   = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
/// ...
/// %res = linalg.generic
///    { indexing_maps = [#projection, #identity, #identity],
///      iterator_types = ["parallel", "parallel", "parallel",
///                        "parallel", "parallel"]}
///    ins(%x, %y : tensor<7x8x9xf32>, tensor<5x9x7x8x10xf32>)
///    outs(%z : tensor<5x9x7x8x10xf32>) {
///     ^bb0(%in: f32, %in_1: f32, %out: f32):
///        %div = arith.divf %in, %in_1 : f32
///        linalg.yield %div : f32
///  } -> tensor<5x9x7x8x10xf32>
/// ```
///
/// In the above IR, operand `%x`'s map is a projected permutation. It can be
/// unfolded as:
///
/// ```mlir
/// ...
/// %x_trans = linalg.transpose
///              ins(%x : tensor<7x8x9xf32>)
///              outs(%e1 : tensor<9x7x8xf32>) permutation = [2, 0, 1]
/// ...
/// %x_trans_bc = linalg.broadcast
///                 ins(%x_trans : tensor<9x7x8xf32>)
///                 outs(%e2 : tensor<5x9x7x8x10xf32>) dimensions = [0, 4]
/// %2 = linalg.div
///        ins(%x_trans_bc, %y :
///            tensor<5x9x7x8x10xf32>, tensor<5x9x7x8x10xf32>)
///        outs(%arg2 : tensor<5x9x7x8x10xf32>) -> tensor<5x9x7x8x10xf32>
/// ```
///
/// Note that the linalg.generic has been 'specialized' to linalg.div.
///
/// When unfolding, it is more efficient to transpose first and then
/// broadcast. However, if the transpose is done first, the permutation map
/// needs to be expressed in terms of the reduced (pre-broadcast) dimensions.
/// Also, the sizes of the broadcast dimensions of a linalg.generic operand
/// come from the other operands (those not broadcast along that particular
/// dimension). We work this out by computing the shape of the
/// linalg.generic's iteration space from the shapes of all the operands,
/// both inputs and outputs.
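/// For the running example, the iteration-space shape [5, 9, 7, 8, 10] is
/// read off the identity-mapped operands `%y` and `%z`
/// (tensor<5x9x7x8x10xf32>); `%x` alone could not supply the sizes of the
/// broadcast dimensions d0 and d4.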
///
struct DecomposeProjectedPermutation : public OpRewritePattern<GenericOp> {
  using OpRewritePattern<GenericOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(GenericOp op,
                                PatternRewriter &rewriter) const override;
};

/// For the given `map`, determine which dimensions are transposed and which
/// dimensions are broadcast.
/// Returns a pair `{transpose-permutation, broadcast-dimensions}`; either
/// vector is empty if the corresponding operation is not needed.
///
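/// Example, using the map from the pattern documentation above:
///   map = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d1)>
///   returns {[2, 0, 1], [0, 4]}: the transpose permutation re-based onto
///   the three result dimensions, and the broadcast dimensions d0 and d4.
///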
std::pair<SmallVector<int64_t>, SmallVector<int64_t>>
computeTransposeBroadcast(AffineMap map) {
  assert(map.isProjectedPermutation(/*allowZeroInResults=*/false) &&
         "not a projection");

  // As the map is a projection, the transpose operates on a (possibly)
  // smaller set of dimensions; the remaining dimensions are broadcast.
  int64_t minorSize = map.getNumResults();

  SmallVector<int64_t> minorResult;
  for (int64_t i = 0; i < minorSize; ++i) {
    auto expr = cast<AffineDimExpr>(map.getResults()[i]);
    minorResult.push_back(expr.getPosition());
  }

  // If the dims are not monotonically increasing then a transpose is present.
  SmallVector<int64_t> sortedResMap(minorResult);
  llvm::sort(sortedResMap);
  bool hasTranspose = !std::equal(minorResult.begin(), minorResult.end(),
                                  sortedResMap.begin(), sortedResMap.end());
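  // e.g. minorResult = [2, 3, 1] vs. sorted [1, 2, 3]: a transpose is present.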

  // Walk the sorted map result to determine which dimensions are broadcast.
  SmallVector<int64_t> broadcast;
  for (int64_t i = 0, j = 0; i < map.getNumInputs(); ++i) {
    if (j < minorSize && sortedResMap[j] == i) {
      j++;
      continue;
    }
    broadcast.push_back(i);
  }
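  // e.g. with five loop dims and results covering {1, 2, 3}, broadcast = [0, 4].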

  SmallVector<int64_t> permutation;
  if (hasTranspose) {
    // Consider an operand `x : tensor<7x8x9>` of a genericOp that has
    // affine map `affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d1)>`.
    // `x`'s access is both transposed and broadcast. But when specifying
    // the `linalg.transpose(x : tensor<7x8x9>)` the permutation must be
    // expressed over just the three dimensions of `x`, i.e. as
    // `affine_map<(d0, d1, d2) -> (d1, d2, d0)>`, without referring to d3
    // or d4. Therefore, re-base the transpose dimensions so that they
    // start from d0.
    permutation.resize(minorSize);
    std::map<int64_t, int64_t> minorMap;
    for (int64_t i = 0; i < minorSize; ++i)
      minorMap.insert({sortedResMap[i], i});

    // Re-map the dimensions.
    SmallVector<int64_t> remappedResult(minorSize);
    for (int64_t i = 0; i < minorSize; ++i)
      remappedResult[i] = minorMap[minorResult[i]];
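    // For the running example, minorMap = {1 -> 0, 2 -> 1, 3 -> 2}, so
    // minorResult = [2, 3, 1] remaps to [1, 2, 0].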

    // Calculate the permutation for the transpose.
    for (int64_t i = 0; i < minorSize; ++i) {
      permutation[remappedResult[i]] = i;
    }
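    // For the running example, remappedResult = [1, 2, 0] yields
    // permutation = [2, 0, 1], matching the linalg.transpose shown in the
    // pattern documentation.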
  }
  return {permutation, broadcast};
}

LogicalResult DecomposeProjectedPermutation::matchAndRewrite(
    GenericOp op, PatternRewriter &rewriter) const {
  if (!op.hasPureTensorSemantics() || op.isSingleInputOutput() ||
      op.isSingleYieldOp() || !op.isAllParallelLoops())
    return failure();

  // If the map of an operand is not a `projected permutation` then the
  // operand cannot be decomposed into a mere transpose and broadcast.
  // The requirement that all maps be `projected permutation` may be
  // over-restrictive, but since we also need to determine the shape of the
  // iteration space, reject the op if any map violates the assumption.
  for (auto &opOperand : op->getOpOperands()) {
    auto map = op.getMatchingIndexingMap(&opOperand);
    if (!map.isProjectedPermutation(/*allowZeroInResults=*/false))
      return failure();
  }

  // Decomposing a linalg.generic involves creating `tensor.empty` ops, which
  // can have dynamic shapes, but then we would have to work out which
  // operand can supply each runtime value (via tensor.dim). Bail out on
  // dynamic shapes for now and leave that as a future TODO.
  if (llvm::any_of(op->getOpOperands(), [](OpOperand &oper) {
        auto opType = cast<RankedTensorType>(oper.get().getType());
        return ShapedType::isDynamicShape(opType.getShape());
      }))
    return failure();

  auto outputShape = op.getStaticLoopRanges();
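  // All loops are parallel and every map is a projected permutation, so the
  // static loop ranges give the full iteration-space shape
  // ([5, 9, 7, 8, 10] for the example in the pattern documentation).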

  auto loc = op.getLoc();
  bool isChanged = false;
  SmallVector<Value> newInitValues = op.getDpsInputs();
  SmallVector<AffineMap> newMap = op.getIndexingMapsArray();

  // Walk over each input operand and unfold it if its affine map encodes a
  // transpose, a broadcast, or a mix of the two.
  for (int64_t i = 0; i < op.getNumDpsInputs(); ++i) {
    auto &map = newMap[i];
    auto inputRTType = cast<RankedTensorType>(newInitValues[i].getType());
    auto elType = inputRTType.getElementType();

    // Nothing to do if the map is already an identity.
    if (map.isIdentity())
      continue;

    auto [permutation, broadcastedDims] = computeTransposeBroadcast(map);

    // Does it need a transpose?
    if (!permutation.empty()) {
      // linalg.transpose permutes the dimensions of the input using the
      // rule: dim(result, i) = dim(input, permutation[i]).
      SmallVector<int64_t> transposedShape(map.getNumResults());
      for (int64_t dim = 0; dim < map.getNumResults(); ++dim)
        transposedShape[dim] = inputRTType.getShape()[permutation[dim]];
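      // e.g. shape [7, 8, 9] with permutation [2, 0, 1] gives [9, 7, 8].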

      Value emptyTensor =
          tensor::EmptyOp::create(rewriter, loc, transposedShape, elType);

      auto transposeOp = TransposeOp::create(rewriter, loc, newInitValues[i],
                                             emptyTensor, permutation);
      newInitValues[i] = transposeOp->getResult(0);
      isChanged = true;
    }

    // Does it require a broadcast?
    if (!broadcastedDims.empty()) {
      Value emptyTensor =
          tensor::EmptyOp::create(rewriter, loc, outputShape, elType);

      auto broadcastOp = linalg::BroadcastOp::create(
          rewriter, loc, newInitValues[i], emptyTensor, broadcastedDims);

      newInitValues[i] = broadcastOp->getResult(0);
      isChanged = true;
    }
    newMap[i] = rewriter.getMultiDimIdentityMap(map.getNumDims());
  }

  if (!isChanged)
    return failure();

  SmallVector<Value> operands = op->getOperands();
  ValueRange operandsRef(operands);

  auto newOp = linalg::GenericOp::create(
      rewriter,
      /*location=*/op.getLoc(),
      /*resultTensorTypes=*/op->getResultTypes(),
      /*inputs=*/newInitValues,
      /*outputs=*/operandsRef.drop_front(op.getNumDpsInputs()),
      /*indexingMaps=*/newMap,
      /*iteratorTypes=*/op.getIteratorTypesArray());
  newOp.getRegion().takeBody(op->getRegion(0));
  rewriter.replaceOp(op, newOp->getResults());
  return success();
}

} // namespace

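// A minimal sketch of applying this pattern from a pass; `funcOp` and the
// choice of the greedy rewrite driver are illustrative assumptions, not part
// of this file:
//
//   RewritePatternSet patterns(funcOp.getContext());
//   linalg::populateDecomposeProjectedPermutationPatterns(patterns);
//   if (failed(applyPatternsGreedily(funcOp, std::move(patterns))))
//     signalPassFailure();
//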
void mlir::linalg::populateDecomposeProjectedPermutationPatterns(
    RewritePatternSet &patterns) {
  patterns.insert<DecomposeProjectedPermutation>(patterns.getContext());
}