| //===- ConvertToDestinationStyle.cpp - Convert non-DPS to DPS ops ---------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file contains patterns to convert non-DPS ops to DPS ops. New |
| // tensor.empty ops are inserted as destinations. Such tensor.empty ops can be |
| // eliminated with "empty tensor elimination", allowing them to bufferize |
| // without an allocation (assuming there are no further conflicts). |
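| // |
| // For example (illustrative IR; SSA names are placeholders and details are |
| // elided), a value-producing op without a destination such as |
| // |
| //   %0 = tensor.generate %sz { ... } : tensor<?xf32> |
| // |
| // is rewritten to compute its result into a new tensor.empty destination: |
| // |
| //   %empty = tensor.empty(%sz) : tensor<?xf32> |
| //   %0 = linalg.generic ... outs(%empty : tensor<?xf32>) { ... } -> tensor<?xf32> |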
| // |
| //===----------------------------------------------------------------------===// |
| // |
| #include "mlir/Dialect/Arith/IR/Arith.h" |
| #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" |
| #include "mlir/Dialect/Bufferization/IR/Bufferization.h" |
| #include "mlir/Dialect/Linalg/IR/Linalg.h" |
| #include "mlir/Dialect/Linalg/Transforms/Transforms.h" |
| #include "mlir/Dialect/Tensor/IR/Tensor.h" |
| #include "mlir/Dialect/Utils/StaticValueUtils.h" |
| #include "mlir/IR/Matchers.h" |
| #include "mlir/IR/PatternMatch.h" |
| #include "llvm/ADT/STLExtras.h" |
| |
| using namespace mlir; |
| using namespace mlir::tensor; |
| |
| // Implements backtracking to traverse the indices of the destination tensor |
| // while iterating over op.getElements(). |
| static Value createInserts(RewriterBase &rewriter, Location loc, int dim, |
| Value destination, ArrayRef<int64_t> shape, |
| ArrayRef<Value> constants, |
| OperandRange::iterator &elementIt, |
| SmallVectorImpl<Value> &indices) { |
| if (dim == static_cast<int>(shape.size()) - 1) { |
| for (int i = 0; i < shape.back(); ++i) { |
| indices.back() = constants[i]; |
| destination = tensor::InsertOp::create(rewriter, loc, *elementIt, |
| destination, indices); |
| ++elementIt; |
| } |
| return destination; |
| } |
| for (int i = 0; i < shape[dim]; ++i) { |
| indices[dim] = constants[i]; |
| destination = createInserts(rewriter, loc, dim + 1, destination, shape, |
| constants, elementIt, indices); |
| } |
| return destination; |
| } |
| |
| /// Create a memcpy from the given source tensor to the given destination |
| /// memref. The copy op type can be specified in the `options`. |
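| /// |
| /// With the default MaterializeInDestination option this emits, roughly |
| /// (sketch; SSA names are placeholders and types are elided): |
| /// |
| ///   bufferization.materialize_in_destination %src in writable %dest |
| ///       : (tensor<...>, memref<...>) -> () |
| /// |
| /// With MemrefCopy / LinalgCopy, the source is first converted with |
| /// bufferization.to_buffer (fully dynamic layout) and then copied with |
| /// memref.copy / linalg.copy, respectively. |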
| static void createMemcpy(OpBuilder &b, Location loc, Value tensorSource, |
| Value memrefDest, |
| const linalg::BufferizeToAllocationOptions &options) { |
| auto tensorType = dyn_cast<RankedTensorType>(tensorSource.getType()); |
| assert(tensorType && "expected ranked tensor"); |
| assert(isa<MemRefType>(memrefDest.getType()) && "expected ranked memref"); |
| |
| switch (options.memcpyOp) { |
| case linalg::BufferizeToAllocationOptions::MemcpyOp:: |
| MaterializeInDestination: { |
| // Note: This is the preferred way of copying because no layout map and/or |
| // memory space has to be specified for the source. |
| auto materializeOp = bufferization::MaterializeInDestinationOp::create( |
| b, loc, tensorSource, memrefDest); |
| materializeOp.setWritable(true); |
| } break; |
| case linalg::BufferizeToAllocationOptions::MemcpyOp::MemrefCopy: { |
| // TODO: Support custom memory space on source. |
| // We do not know the layout map of the source yet, so use a fully dynamic |
| // layout for best compatibility. |
| Value toBuffer = bufferization::ToBufferOp::create( |
| b, loc, bufferization::getMemRefTypeWithFullyDynamicLayout(tensorType), |
| tensorSource, /*read_only=*/true); |
| memref::CopyOp::create(b, loc, toBuffer, memrefDest); |
| } break; |
| case linalg::BufferizeToAllocationOptions::MemcpyOp::LinalgCopy: { |
| // TODO: Support custom memory space on source. |
| // We do not know the layout map of the source yet, so use a fully dynamic |
| // layout for best compatibility. |
| Value toBuffer = bufferization::ToBufferOp::create( |
| b, loc, bufferization::getMemRefTypeWithFullyDynamicLayout(tensorType), |
| tensorSource, /*read_only=*/true); |
| linalg::CopyOp::create(b, loc, toBuffer, memrefDest); |
| } break; |
| }; |
| } |
| |
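| /// Move the padding logic of `padOp` onto `dest`: create a linalg.fill if the |
| /// yielded padding value is a constant or is invariant (defined outside of the |
| /// pad op), and a linalg.generic that re-evaluates the pad body otherwise. |
| /// Returns the newly created fill or generic op. |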
| static Operation *movePaddingToFillOrGenericOp(RewriterBase &rewriter, |
| Location loc, PadOp padOp, |
| Value dest) { |
| OpBuilder::InsertionGuard g(rewriter); |
| RankedTensorType resultType = padOp.getResultType(); |
| |
| // Examine the yielded value to decide whether a linalg.generic is needed or |
| // a linalg.fill is sufficient. |
| Value yieldedValue = |
| cast<tensor::YieldOp>(padOp.getBody()->getTerminator()).getValue(); |
| Attribute constYieldedValue; |
| // Is the yielded value a bbArg defined outside of the PadOp? |
| bool outsideBbArg = |
| isa<BlockArgument>(yieldedValue) && |
| cast<BlockArgument>(yieldedValue).getOwner()->getParentOp() != |
| padOp.getOperation(); |
| // Is the yielded value an OpResult defined outside of the PadOp? |
| bool outsideOpResult = |
| isa<OpResult>(yieldedValue) && |
| yieldedValue.getDefiningOp()->getParentOp() != padOp.getOperation(); |
| bool invariantYieldedValue = outsideBbArg || outsideOpResult; |
| if (matchPattern(yieldedValue, m_Constant(&constYieldedValue))) { |
| // Padding with a constant: Create linalg.fill. |
| Dialect *arithDialect = |
| rewriter.getContext()->getLoadedDialect<arith::ArithDialect>(); |
| Value fillValue = |
| arithDialect |
| ->materializeConstant(rewriter, constYieldedValue, |
| yieldedValue.getType(), yieldedValue.getLoc()) |
| ->getResult(0); |
| auto fillOp = linalg::FillOp::create(rewriter, loc, ValueRange(fillValue), |
| ValueRange(dest)); |
| return fillOp; |
| } |
| |
| if (invariantYieldedValue) { |
| // Padding with an invariant value. |
| auto fillOp = linalg::FillOp::create( |
| rewriter, loc, ValueRange(yieldedValue), ValueRange(dest)); |
| return fillOp; |
| } |
| |
| // Create linalg.generic. |
| SmallVector<utils::IteratorType> iteratorTypes(resultType.getRank(), |
| utils::IteratorType::parallel); |
| SmallVector<AffineMap> indexingMaps( |
| 1, rewriter.getMultiDimIdentityMap(resultType.getRank())); |
| auto genericOp = linalg::GenericOp::create( |
| rewriter, loc, resultType, /*inputs=*/ValueRange(), |
| /*outputs=*/ValueRange{dest}, /*indexingMaps=*/ |
| indexingMaps, iteratorTypes); |
| Block *body = rewriter.createBlock(&genericOp->getRegion(0), {}, |
| resultType.getElementType(), loc); |
| rewriter.setInsertionPointToStart(body); |
| SmallVector<Value> bbArgReplacements; |
| for (int64_t i = 0; i < resultType.getRank(); ++i) |
| bbArgReplacements.push_back(linalg::IndexOp::create(rewriter, loc, i)); |
| rewriter.mergeBlocks(padOp.getBody(), body, bbArgReplacements); |
| |
| // Update terminator. |
| auto yieldOp = cast<tensor::YieldOp>(body->getTerminator()); |
| rewriter.replaceOpWithNewOp<linalg::YieldOp>(yieldOp, yieldOp.getValue()); |
| return genericOp; |
| } |
| |
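| /// Return SSA values for the dynamic dimension sizes of the given ranked |
| /// tensor value: reify the result shape if the value is an OpResult whose |
| /// defining op implements shape reification, and create tensor.dim ops |
| /// otherwise. Returns an empty vector for statically shaped tensors. |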
| static SmallVector<Value> reifyOrComputeDynamicSizes(OpBuilder &b, |
| Value value) { |
| auto tensorType = cast<RankedTensorType>(value.getType()); |
| if (tensorType.hasStaticShape()) |
| return {}; |
| |
| // Try to reify dynamic sizes. |
| ReifiedRankedShapedTypeDims reifiedShape; |
| if (isa<OpResult>(value) && |
| succeeded(reifyResultShapes(b, value.getDefiningOp(), reifiedShape))) { |
| SmallVector<Value> dynSizes; |
| for (int64_t i = 0; i < tensorType.getRank(); ++i) { |
| if (tensorType.isDynamicDim(i)) |
| dynSizes.push_back(cast<Value>( |
| reifiedShape[cast<OpResult>(value).getResultNumber()][i])); |
| } |
| return dynSizes; |
| } |
| |
| // Create tensor.dim ops. |
| SmallVector<Value> dynSizes; |
| for (int64_t i = 0; i < tensorType.getRank(); ++i) { |
| if (tensorType.isDynamicDim(i)) |
| dynSizes.push_back( |
| DimOp::create(b, value.getLoc(), value, |
| arith::ConstantIndexOp::create(b, value.getLoc(), i))); |
| } |
| return dynSizes; |
| } |
| |
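| /// Allocate a buffer for the given tensor value with a static identity layout |
| /// (memref.alloc or memref.alloca, depending on `options`). For memref.alloc, |
| /// optionally emit a matching memref.dealloc at the end of the block. |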
| static Value |
| createAllocationForTensor(RewriterBase &rewriter, Location loc, Value value, |
| const linalg::BufferizeToAllocationOptions &options, |
| Attribute memorySpace = {}) { |
| OpBuilder::InsertionGuard g(rewriter); |
| auto tensorType = cast<RankedTensorType>(value.getType()); |
| |
| // Create buffer allocation. |
| auto memrefType = |
| cast<MemRefType>(bufferization::getMemRefTypeWithStaticIdentityLayout( |
| tensorType, memorySpace)); |
| SmallVector<Value> dynamicSizes = reifyOrComputeDynamicSizes(rewriter, value); |
| |
| Value alloc; |
| if (options.allocOp == |
| linalg::BufferizeToAllocationOptions::AllocOp::MemrefAlloc) { |
| alloc = memref::AllocOp::create(rewriter, loc, memrefType, dynamicSizes); |
| if (options.emitDealloc) { |
| // Place deallocation at the end of the block. |
| rewriter.setInsertionPoint(rewriter.getInsertionBlock()->getTerminator()); |
| memref::DeallocOp::create(rewriter, loc, alloc); |
| } |
| } else if (options.allocOp == |
| linalg::BufferizeToAllocationOptions::AllocOp::MemrefAlloca) { |
| alloc = memref::AllocaOp::create(rewriter, loc, memrefType, dynamicSizes); |
| // No dealloc is needed. |
| } |
| |
| return alloc; |
| } |
| |
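| /// Bufferize tensor.pad to a new allocation: materialize the padding value |
| /// directly into the allocated buffer (linalg.fill / linalg.generic), copy the |
| /// pad source into a subview at the low-pad offsets, and expose the buffer as |
| /// a tensor again via bufferization.to_tensor with "restrict" and "writable". |
| /// A rough sketch of the generated IR (SSA names are placeholders; sizes, |
| /// types and the exact copy op depend on `options`): |
| /// |
| ///   %alloc = memref.alloc(...) : memref<?x?xf32> |
| ///   linalg.fill ins(%pad_value : f32) outs(%alloc : memref<?x?xf32>) |
| ///   %view = memref.subview %alloc[%low0, %low1] [...] [1, 1] |
| ///   <copy of the pad source into %view> |
| ///   %t = bufferization.to_tensor %alloc restrict writable : ... |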
| Value linalg::bufferizeToAllocation( |
| RewriterBase &rewriter, const linalg::BufferizeToAllocationOptions &options, |
| PadOp padOp, Attribute memorySpace, Operation *insertionPoint) { |
| // tensor.pad does not have a destination operand. |
| assert(!options.bufferizeDestinationOnly && "invalid options"); |
| |
| OpBuilder::InsertionGuard g(rewriter); |
| rewriter.setInsertionPoint(insertionPoint ? insertionPoint : padOp); |
| Location loc = padOp.getLoc(); |
| |
| // Create buffer allocation. |
| Value alloc = createAllocationForTensor(rewriter, loc, padOp.getResult(), |
| options, memorySpace); |
| rewriter.setInsertionPoint(padOp); |
| |
| if (!padOp.hasZeroLowPad() || !padOp.hasZeroHighPad()) { |
| // Create linalg.fill or linalg.generic. Not needed if there is no padding. |
| Operation *fillOp = |
| movePaddingToFillOrGenericOp(rewriter, loc, padOp, alloc); |
| rewriter.setInsertionPointAfter(fillOp); |
| } |
| |
| // Create memcpy. |
| SmallVector<OpFoldResult> sizes = |
| getMixedSizes(rewriter, loc, padOp.getSource()); |
| SmallVector<OpFoldResult> strides(padOp.getResultType().getRank(), |
| rewriter.getIndexAttr(1)); |
| Value subview = memref::SubViewOp::create( |
| rewriter, loc, alloc, /*offsets=*/padOp.getMixedLowPad(), sizes, strides); |
| createMemcpy(rewriter, loc, padOp.getSource(), subview, options); |
| |
| // Create bufferization.to_tensor with "restrict" and "writable". The returned |
| // tensor is backed by a newly allocated buffer, so it does not alias with |
| // any other buffer. |
| Value toTensorOp = bufferization::ToTensorOp::create( |
| rewriter, loc, padOp.getResult().getType(), alloc, /*restrict=*/true, |
| /*writable=*/true); |
| rewriter.replaceOp(padOp, toTensorOp); |
| return alloc; |
| } |
| |
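| /// Bufferize the single op wrapped by the given vector.mask to a new |
| /// allocation: bufferize the masked op's destination to an allocation, then |
| /// bufferize the vector.yield terminator and the vector.mask op itself, and |
| /// mark the resulting to_tensor ops as "restrict" and "writable". |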
| Value linalg::bufferizeToAllocation( |
| RewriterBase &rewriter, const linalg::BufferizeToAllocationOptions &options, |
| vector::MaskOp maskOp, Attribute memorySpace, Operation *insertionPoint) { |
| assert(llvm::range_size(maskOp.getMaskBlock()->without_terminator()) == 1 && |
| "expected single masked op"); |
| OpBuilder::InsertionGuard g(rewriter); |
| |
| // Should the bufferization options and state be function arguments? |
| bufferization::BufferizationOptions bufferizationOptions; |
| bufferization::BufferizationState bufferizationState; |
| |
| Operation *yieldOp = maskOp.getMaskRegion().front().getTerminator(); |
| assert(isa<vector::YieldOp>(yieldOp) && "expected yield op terminator"); |
| |
| // Bufferize maskable op. By default, place the buffer allocation right before |
| // the mask op. |
| Value alloc = bufferizeToAllocation( |
| rewriter, options, maskOp.getMaskableOp(), memorySpace, |
| /*insertionPoint=*/insertionPoint ? insertionPoint : maskOp); |
| |
| if (options.bufferizeDestinationOnly) |
| return alloc; |
| |
| // Bufferize terminator. |
| rewriter.setInsertionPoint(yieldOp); |
| if (failed(cast<bufferization::BufferizableOpInterface>(yieldOp).bufferize( |
| rewriter, bufferizationOptions, bufferizationState))) |
| return nullptr; |
| |
| // Erase dead to_tensor ops inside of the mask op. This is necessary because |
| // there may only be one op (apart from the terminator) inside the mask op. |
| // TODO: Remove dead to_tensor ops more aggressively during bufferization. |
| SmallVector<Operation *> toTensorOps; |
| maskOp.walk([&](bufferization::ToTensorOp toTensorOp) { |
| if (toTensorOp->getUses().empty()) |
| toTensorOps.push_back(toTensorOp.getOperation()); |
| }); |
| for (Operation *op : toTensorOps) |
| rewriter.eraseOp(op); |
| |
| // Bufferize mask op. |
| SmallVector<OpOperand *> resultUses; |
| for (Value result : maskOp.getResults()) |
| if (isa<TensorType>(result.getType())) |
| for (OpOperand &use : result.getUses()) |
| resultUses.push_back(&use); |
| rewriter.setInsertionPoint(maskOp); |
| if (failed( |
| cast<bufferization::BufferizableOpInterface>(maskOp.getOperation()) |
| .bufferize(rewriter, bufferizationOptions, bufferizationState))) |
| return nullptr; |
| |
| // Set "restrict" attribute, indicating that no other tensor aliases with |
| // this tensor. That is because we just allocated a new buffer for the tensor. |
| for (OpOperand *resultUse : resultUses) { |
| auto toTensorOp = |
| resultUse->get().getDefiningOp<bufferization::ToTensorOp>(); |
| assert(toTensorOp && "expected to_tensor op"); |
| rewriter.modifyOpInPlace(toTensorOp, [&]() { |
| toTensorOp.setRestrict(true); |
| toTensorOp.setWritable(true); |
| }); |
| } |
| |
| return alloc; |
| } |
| |
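| /// Bufferize bufferization.alloc_tensor to an explicit buffer allocation plus |
| /// a bufferization.to_tensor op with "restrict" and "writable". |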
| Value linalg::bufferizeToAllocation( |
| RewriterBase &rewriter, const linalg::BufferizeToAllocationOptions &options, |
| bufferization::AllocTensorOp allocTensorOp, Attribute memorySpace, |
| Operation *insertionPoint) { |
| Location loc = allocTensorOp.getLoc(); |
| OpBuilder::InsertionGuard g(rewriter); |
| rewriter.setInsertionPoint(insertionPoint ? insertionPoint : allocTensorOp); |
| bufferization::BufferizationOptions bufferizationOptions; |
| |
| // Create buffer allocation. |
| Value alloc = createAllocationForTensor( |
| rewriter, loc, allocTensorOp.getResult(), options, memorySpace); |
| |
| // Create bufferization.to_tensor with "restrict" and "writable". The returned |
| // tensor is backed by a newly allocated buffer, so it does not alias with |
| // any other buffer. |
| Value toTensorOp = bufferization::ToTensorOp::create( |
| rewriter, loc, allocTensorOp.getResult().getType(), alloc, |
| /*restrict=*/true, |
| /*writable=*/true); |
| rewriter.replaceOp(allocTensorOp, toTensorOp); |
| return alloc; |
| } |
| |
| /// Lower tensor.from_elements to a sequence of chained tensor.insert. |
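| /// Sketch (SSA names are placeholders; %c0/%c1 are index constants created by |
| /// the rewrite): |
| /// |
| ///   %t = tensor.from_elements %a, %b, %c, %d : tensor<2x2xf32> |
| /// |
| /// becomes |
| /// |
| ///   %empty = tensor.empty() : tensor<2x2xf32> |
| ///   %0 = tensor.insert %a into %empty[%c0, %c0] : tensor<2x2xf32> |
| ///   %1 = tensor.insert %b into %0[%c0, %c1] : tensor<2x2xf32> |
| ///   %2 = tensor.insert %c into %1[%c1, %c0] : tensor<2x2xf32> |
| ///   %t = tensor.insert %d into %2[%c1, %c1] : tensor<2x2xf32> |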
| FailureOr<Operation *> mlir::linalg::rewriteInDestinationPassingStyle( |
| RewriterBase &rewriter, tensor::FromElementsOp fromElementsOp) { |
| Location loc = fromElementsOp.getLoc(); |
| RankedTensorType tensorType = |
| cast<RankedTensorType>(fromElementsOp.getType()); |
| auto shape = tensorType.getShape(); |
| |
| // Create tensor.empty. |
| auto emptyOp = EmptyOp::create(rewriter, loc, tensorType, ValueRange()); |
| |
| // Case: tensor<elem_type>. |
| if (shape.empty()) { |
| Operation *res = rewriter.replaceOpWithNewOp<tensor::InsertOp>( |
| fromElementsOp, fromElementsOp.getElements().front(), |
| emptyOp.getResult(), ValueRange()); |
| return res; |
| } |
| |
| // Create constants for the range of possible indices [0, max{shape_i}). |
| auto maxDim = *llvm::max_element(shape); |
| SmallVector<Value, 2> constants; |
| constants.reserve(maxDim); |
| for (int i = 0; i < maxDim; ++i) |
| constants.push_back(arith::ConstantIndexOp::create(rewriter, loc, i)); |
| |
| // Traverse all elements and create tensor.insert ops. |
| auto elementIt = fromElementsOp.getElements().begin(); |
| SmallVector<Value, 2> indices(tensorType.getRank(), constants[0]); |
| Value result = createInserts(rewriter, loc, /*dim=*/0, emptyOp.getResult(), |
| shape, constants, elementIt, indices); |
| |
| // Replace tensor.from_elements. |
| rewriter.replaceOp(fromElementsOp, result); |
| return result.getDefiningOp(); |
| } |
| |
| /// Lower tensor.generate to linalg.generic. |
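| /// The generate body is inlined into a linalg.generic that writes into a new |
| /// tensor.empty destination; the original index block arguments are replaced |
| /// by linalg.index ops. The resulting generic body looks roughly like: |
| /// |
| ///   ^bb0(%out: f32): |
| ///     %i = linalg.index 0 : index |
| ///     %j = linalg.index 1 : index |
| ///     ... |
| ///     linalg.yield %val : f32 |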
| FailureOr<Operation *> |
| mlir::linalg::rewriteInDestinationPassingStyle(RewriterBase &rewriter, |
| tensor::GenerateOp generateOp) { |
| // Only ops with exactly one block are supported. |
| if (!generateOp.getBody().hasOneBlock()) |
| return failure(); |
| |
| Location loc = generateOp.getLoc(); |
| RankedTensorType tensorType = cast<RankedTensorType>(generateOp.getType()); |
| |
| // Create tensor.empty. |
| auto emptyOp = EmptyOp::create(rewriter, loc, tensorType, |
| generateOp.getDynamicExtents()); |
| |
| // Create linalg.generic. |
| SmallVector<utils::IteratorType> iteratorTypes(tensorType.getRank(), |
| utils::IteratorType::parallel); |
| SmallVector<AffineMap> indexingMaps( |
| 1, rewriter.getMultiDimIdentityMap(tensorType.getRank())); |
| auto genericOp = linalg::GenericOp::create( |
| rewriter, loc, tensorType, /*inputs=*/ValueRange(), |
| /*outputs=*/ValueRange{emptyOp.getResult()}, /*indexingMaps=*/ |
| indexingMaps, iteratorTypes); |
| Block *body = rewriter.createBlock(&genericOp->getRegion(0), {}, |
| tensorType.getElementType(), loc); |
| rewriter.setInsertionPointToStart(body); |
| SmallVector<Value> bbArgReplacements; |
| for (int64_t i = 0; i < tensorType.getRank(); ++i) |
| bbArgReplacements.push_back(linalg::IndexOp::create(rewriter, loc, i)); |
| rewriter.mergeBlocks(&generateOp.getBody().front(), body, bbArgReplacements); |
| |
| // Update terminator. |
| auto yieldOp = cast<tensor::YieldOp>(body->getTerminator()); |
| rewriter.replaceOpWithNewOp<linalg::YieldOp>(yieldOp, yieldOp.getValue()); |
| |
| // Replace tensor.generate. |
| rewriter.replaceOp(generateOp, genericOp->getResult(0)); |
| return genericOp.getOperation(); |
| } |
| |
| /// Lower tensor.pad to linalg.fill/linalg.generic + tensor.insert_slice. |
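| /// Sketch for a constant padding value (SSA names are placeholders; sizes and |
| /// strides are elided): |
| /// |
| ///   %0 = tensor.pad %src low[1, 2] high[3, 4] { |
| ///     ... |
| ///     tensor.yield %cst : f32 |
| ///   } : tensor<?x?xf32> to tensor<?x?xf32> |
| /// |
| /// becomes |
| /// |
| ///   %empty = tensor.empty(...) : tensor<?x?xf32> |
| ///   %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<?x?xf32>) |
| ///             -> tensor<?x?xf32> |
| ///   %0 = tensor.insert_slice %src into %fill[1, 2] [...] [1, 1] |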
| FailureOr<Operation *> |
| mlir::linalg::rewriteInDestinationPassingStyle(RewriterBase &rewriter, |
| tensor::PadOp padOp) { |
| // Only ops with exactly one block are supported. |
| if (!padOp.getBodyRegion().hasOneBlock()) |
| return failure(); |
| |
| // Create tensor.empty. |
| Location loc = padOp.getLoc(); |
| RankedTensorType resultType = padOp.getResultType(); |
| ReifiedRankedShapedTypeDims reifiedShape; |
| if (failed(reifyResultShapes(rewriter, padOp, reifiedShape))) |
| return rewriter.notifyMatchFailure( |
| padOp, "failed to reify tensor.pad op result shape"); |
| SmallVector<Value> dynamicSizes; |
| for (int64_t i = 0; i < resultType.getRank(); ++i) |
| if (resultType.isDynamicDim(i)) |
| dynamicSizes.push_back(cast<Value>(reifiedShape[0][i])); |
| |
| // If the `padOp` has a nofold attribute and all paddings are known to be 0, |
| // explicitly insert a `linalg.copy`. |
| if (padOp.getNofoldAttr() && |
| llvm::all_of(padOp.getMixedLowPad(), isZeroInteger) && |
| llvm::all_of(padOp.getMixedHighPad(), isZeroInteger)) { |
| using bufferization::AllocTensorOp; |
| Value allocated = |
| AllocTensorOp::create(rewriter, loc, resultType, dynamicSizes); |
| auto copyOp = rewriter.replaceOpWithNewOp<linalg::CopyOp>( |
| padOp, padOp.getSource(), allocated); |
| return copyOp.getOperation(); |
| } |
| |
| Value empty = EmptyOp::create(rewriter, loc, resultType, dynamicSizes); |
| // Create linalg.fill or linalg.generic. |
| Operation *fillOp = movePaddingToFillOrGenericOp(rewriter, loc, padOp, empty); |
| rewriter.setInsertionPointAfter(fillOp); |
| |
| // Create tensor::InsertSliceOp. |
| SmallVector<OpFoldResult> sliceSizes = |
| getMixedSizes(rewriter, loc, padOp.getSource()); |
| SmallVector<OpFoldResult> sliceStrides(resultType.getRank(), |
| rewriter.getIndexAttr(1)); |
| auto insertSliceOp = rewriter.replaceOpWithNewOp<tensor::InsertSliceOp>( |
| padOp, padOp.getSource(), fillOp->getResult(0), |
| /*offsets=*/padOp.getMixedLowPad(), sliceSizes, sliceStrides); |
| return insertSliceOp.getOperation(); |
| } |
| |
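| /// Generic entry point: dispatch to the specialized overloads for tensor.pad, |
| /// vector.mask and bufferization.alloc_tensor. Otherwise, for a bufferizable |
| /// op with exactly one out-of-place tensor operand, allocate a buffer for that |
| /// operand (copying the existing data if necessary) and then bufferize the op. |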
| Value linalg::bufferizeToAllocation( |
| RewriterBase &rewriter, const linalg::BufferizeToAllocationOptions &options, |
| Operation *op, Attribute memorySpace, Operation *insertionPoint) { |
| using namespace bufferization; |
| |
| // Call specialized overload for certain ops. |
| if (auto padOp = dyn_cast<tensor::PadOp>(op)) |
| return bufferizeToAllocation(rewriter, options, padOp, memorySpace); |
| if (auto maskOp = dyn_cast<vector::MaskOp>(op)) |
| return bufferizeToAllocation(rewriter, options, maskOp, memorySpace); |
| if (auto allocTensorOp = dyn_cast<bufferization::AllocTensorOp>(op)) |
| return bufferizeToAllocation(rewriter, options, allocTensorOp, memorySpace); |
| |
| // Only bufferizable ops are supported. |
| auto bufferizableOp = dyn_cast<BufferizableOpInterface>(op); |
| if (!bufferizableOp) |
| return nullptr; |
| |
| // Should the bufferization options and states be function arguments? |
| BufferizationOptions bufferizationOptions; |
| AnalysisState analysisState(bufferizationOptions); |
| BufferizationState bufferizationState; |
| |
| #ifndef NDEBUG |
| if (!options.bufferizeDestinationOnly) { |
| // Ops with nested tensor ops are not supported yet. At the moment, this |
| // function just bufferizes the given op itself, but not its body. |
| op->walk([&](Operation *nestedOp) { |
| if (op == nestedOp) |
| return; |
| if (llvm::any_of(nestedOp->getOperands(), |
| [](Value v) { return isa<TensorType>(v.getType()); })) |
| llvm_unreachable("ops with nested tensor ops are not supported yet"); |
| if (llvm::any_of(nestedOp->getResults(), |
| [](Value v) { return isa<TensorType>(v.getType()); })) |
| llvm_unreachable("ops with nested tensor ops are not supported yet"); |
| }); |
| } |
| #endif // NDEBUG |
| |
| // Gather tensor results. |
| SmallVector<OpResult> tensorResults; |
| for (OpResult result : op->getResults()) { |
| if (!isa<TensorType>(result.getType())) |
| continue; |
| // Unranked tensors are not supported. |
| if (!isa<RankedTensorType>(result.getType())) |
| return nullptr; |
| // Ops that bufferize to an allocation are not supported. |
| if (bufferizableOp.bufferizesToAllocation(result)) |
| return nullptr; |
| tensorResults.push_back(result); |
| } |
| |
| // Gather all operands that should bufferize to a new allocation, i.e., that |
| // bufferize out-of-place. |
| SmallVector<OpOperand *> outOfPlaceOperands, resultUses; |
| auto addOutOfPlaceOperand = [&](OpOperand *operand) { |
| if (!llvm::is_contained(outOfPlaceOperands, operand)) |
| outOfPlaceOperands.push_back(operand); |
| }; |
| for (OpResult result : tensorResults) { |
| AliasingOpOperandList aliasingOperands = |
| analysisState.getAliasingOpOperands(result); |
| for (const AliasingOpOperand &operand : aliasingOperands) { |
| addOutOfPlaceOperand(operand.opOperand); |
| for (OpOperand &resultUse : result.getUses()) |
| resultUses.push_back(&resultUse); |
| } |
| } |
| for (OpOperand &operand : op->getOpOperands()) { |
| if (!analysisState.bufferizesToMemoryWrite(operand)) |
| continue; |
| if (!isa<RankedTensorType>(operand.get().getType())) |
| continue; |
| addOutOfPlaceOperand(&operand); |
| } |
| // TODO: Support multiple buffers. |
| if (outOfPlaceOperands.size() != 1) |
| return nullptr; |
| |
| // Allocate buffers. |
| OpBuilder::InsertionGuard g(rewriter); |
| rewriter.setInsertionPoint(insertionPoint ? insertionPoint : op); |
| SmallVector<Value> allocs; |
| for (OpOperand *operand : outOfPlaceOperands) { |
| Value alloc = createAllocationForTensor( |
| rewriter, op->getLoc(), operand->get(), options, memorySpace); |
| allocs.push_back(alloc); |
| if (!analysisState.findDefinitions(operand).empty()) { |
| // Initialize buffer with a copy of the operand data. Not needed if the |
| // tensor is uninitialized. |
| createMemcpy(rewriter, op->getLoc(), operand->get(), alloc, options); |
| } |
| rewriter.modifyOpInPlace(op, [&]() { |
| auto toTensorOp = ToTensorOp::create(rewriter, op->getLoc(), |
| operand->get().getType(), alloc); |
| operand->set(toTensorOp); |
| if (options.bufferizeDestinationOnly) { |
| rewriter.modifyOpInPlace(toTensorOp, [&]() { |
| toTensorOp.setRestrict(true); |
| toTensorOp.setWritable(true); |
| }); |
| } |
| }); |
| } |
| |
| if (options.bufferizeDestinationOnly) |
| return allocs.front(); |
| |
| // Bufferize the op. |
| rewriter.setInsertionPoint(op); |
| if (failed(bufferizableOp.bufferize(rewriter, bufferizationOptions, |
| bufferizationState))) |
| return nullptr; |
| |
| // Set "restrict" attribute, indicating that no other tensor aliases with |
| // this tensor. That is because we just allocated a new buffer for the tensor. |
| for (OpOperand *resultUse : resultUses) { |
| auto toTensorOp = resultUse->get().getDefiningOp<ToTensorOp>(); |
| assert(toTensorOp && "expected to_tensor op"); |
| rewriter.modifyOpInPlace(toTensorOp, [&]() { |
| toTensorOp.setRestrict(true); |
| toTensorOp.setWritable(true); |
| }); |
| } |
| return allocs.front(); |
| } |
| |
| namespace { |
| |
| template <typename OpTy> |
| LogicalResult rewriteOpInDestinationPassingStyle(OpTy op, |
| PatternRewriter &rewriter) { |
| return linalg::rewriteInDestinationPassingStyle(rewriter, op); |
| } |
| |
| } // namespace |
| |
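| // A typical way to use these patterns, e.g. from a pass or a transform op |
| // (sketch; the greedy driver call is one possible choice, not mandated by |
| // this file): |
| // |
| //   RewritePatternSet patterns(ctx); |
| //   linalg::populateConvertToDestinationStylePatterns(patterns); |
| //   if (failed(applyPatternsGreedily(op, std::move(patterns)))) |
| //     return failure(); |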
| void linalg::populateConvertToDestinationStylePatterns( |
| RewritePatternSet &patterns) { |
| patterns.add(rewriteOpInDestinationPassingStyle<tensor::FromElementsOp>); |
| patterns.add(rewriteOpInDestinationPassingStyle<tensor::GenerateOp>); |
| patterns.add(rewriteOpInDestinationPassingStyle<tensor::PadOp>); |
| } |