lib/Conversion/VectorToSCF/VectorToSCF.cpp - llvm-project/mlir - Git at Google

 //===- VectorToSCF.cpp - Convert vector to SCF dialect ----------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file implements lowering of vector transfer operations to SCF.
 //
 //===----------------------------------------------------------------------===//

 #include <type_traits>

 #include "mlir/Conversion/VectorToSCF/VectorToSCF.h"

 #include "../PassDetail.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Dialect/Vector/VectorTransforms.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"

 using namespace mlir;
 using vector::TransferReadOp;
 using vector::TransferWriteOp;

 namespace {

 /// Attribute name used for labeling transfer ops during progressive lowering.
 static const char kPassLabel[] = "__vector_to_scf_lowering__";

 /// Patterns that inherit from this struct have access to
 /// VectorTransferToSCFOptions.
 template <typename OpTy>
 struct VectorToSCFPattern : public OpRewritePattern<OpTy> {
   explicit VectorToSCFPattern(MLIRContext *context,
                               VectorTransferToSCFOptions opt)
       : OpRewritePattern<OpTy>(context), options(opt) {}

   VectorTransferToSCFOptions options;
 };

 /// Given a vector transfer op, calculate which dimension of the `source`
 /// memref should be unpacked in the next application of TransferOpConversion.
 /// A return value of None indicates a broadcast.
 template <typename OpTy>
 static Optional<int64_t> unpackedDim(OpTy xferOp) {
   // TODO: support 0-d corner case.
   assert(xferOp.getTransferRank() > 0 && "unexpected 0-d transfer");
   auto map = xferOp.permutation_map();
   if (auto expr = map.getResult(0).template dyn_cast<AffineDimExpr>()) {
     return expr.getPosition();
   }
   assert(xferOp.isBroadcastDim(0) &&
          "Expected AffineDimExpr or AffineConstantExpr");
   return None;
 }

 /// Compute the permutation map for the new (N-1)-D vector transfer op. This
 /// map is identical to the current permutation map, but the first result is
 /// omitted.
 template <typename OpTy>
 static AffineMap unpackedPermutationMap(OpBuilder &b, OpTy xferOp) {
   // TODO: support 0-d corner case.
   assert(xferOp.getTransferRank() > 0 && "unexpected 0-d transfer");
   auto map = xferOp.permutation_map();
   return AffineMap::get(map.getNumDims(), 0, map.getResults().drop_front(),
                         b.getContext());
 }

 /// Calculate the indices for the new vector transfer op.
 ///
 /// E.g.: transfer_read %A[%a, %b, %c, %d] ... : vector<5x4x3xf32> ...
 ///       --> transfer_read %A[%a, %b + iv, %c, %d] ... vector<4x3f32>
 ///                                 ^^^^^^
 ///              `iv` is the iteration variable of the (new) surrounding loop.
 template <typename OpTy>
 static void getXferIndices(OpBuilder &b, OpTy xferOp, Value iv,
                            SmallVector<Value, 8> &indices) {
   typename OpTy::Adaptor adaptor(xferOp);
   // Corresponding memref dim of the vector dim that is unpacked.
   auto dim = unpackedDim(xferOp);
   auto prevIndices = adaptor.indices();
   indices.append(prevIndices.begin(), prevIndices.end());

   Location loc = xferOp.getLoc();
   bool isBroadcast = !dim.hasValue();
   if (!isBroadcast) {
     AffineExpr d0, d1;
     bindDims(xferOp.getContext(), d0, d1);
     Value offset = adaptor.indices()[dim.getValue()];
     indices[dim.getValue()] =
         makeComposedAffineApply(b, loc, d0 + d1, {offset, iv});
   }
 }

 static void maybeYieldValue(OpBuilder &b, Location loc, bool hasRetVal,
                             Value value) {
   if (hasRetVal) {
     assert(value && "Expected non-empty value");
     b.create<scf::YieldOp>(loc, value);
   } else {
     b.create<scf::YieldOp>(loc);
   }
 }

 /// Generates a boolean Value that is true if the iv-th bit in xferOp's mask
 /// is set to true. No such check is generated under following circumstances:
 /// * xferOp does not have a mask.
 /// * xferOp's mask is not 1D. (In case of (N>1)-D, a subvector of the mask is
 ///   computed and attached to the new transfer op in the pattern.)
 /// * The to-be-unpacked dim of xferOp is a broadcast.
 template <typename OpTy>
 static Value generateMaskCheck(OpBuilder &b, OpTy xferOp, Value iv) {
   if (!xferOp.mask())
     return Value();
   if (xferOp.getMaskType().getRank() != 1)
     return Value();
   if (xferOp.isBroadcastDim(0))
     return Value();

   Location loc = xferOp.getLoc();
   return b.create<vector::ExtractElementOp>(loc, xferOp.mask(), iv);
 }

 /// Helper function TransferOpConversion and TransferOp1dConversion.
 /// Generate an in-bounds check if the transfer op may go out-of-bounds on the
 /// specified dimension `dim` with the loop iteration variable `iv`.
 /// E.g., when unpacking dimension 0 from:
 /// ```
 /// %vec = vector.transfer_read %A[%a, %b] %cst
 ///     : vector<5x4xf32>, memref<?x?xf32>
 /// ```
 /// An if check similar to this will be generated inside the loop:
 /// ```
 /// %d = memref.dim %A, %c0 : memref<?x?xf32>
 /// if (%a + iv < %d) {
 ///   (in-bounds case)
 /// } else {
 ///   (out-of-bounds case)
 /// }
 /// ```
 ///
 /// If the transfer is 1D and has a mask, this function generates a more complex
 /// check also accounts for potentially masked out elements.
 ///
 /// This function variant returns the value returned by `inBoundsCase` or
 /// `outOfBoundsCase`. The MLIR type of the return value must be specified in
 /// `resultTypes`.
 template <typename OpTy>
 static Value generateInBoundsCheck(
     OpBuilder &b, OpTy xferOp, Value iv, Optional<int64_t> dim,
     TypeRange resultTypes,
     function_ref<Value(OpBuilder &, Location)> inBoundsCase,
     function_ref<Value(OpBuilder &, Location)> outOfBoundsCase = nullptr) {
   bool hasRetVal = !resultTypes.empty();
   Value cond; // Condition to be built...

   // Condition check 1: Access in-bounds?
   bool isBroadcast = !dim.hasValue(); // No in-bounds check for broadcasts.
   Location loc = xferOp.getLoc();
   ImplicitLocOpBuilder lb(xferOp.getLoc(), b);
   if (!xferOp.isDimInBounds(0) && !isBroadcast) {
     Value memrefDim = vector::createOrFoldDimOp(b, loc, xferOp.source(), *dim);
     AffineExpr d0, d1;
     bindDims(xferOp.getContext(), d0, d1);
     Value base = xferOp.indices()[dim.getValue()];
     Value memrefIdx = makeComposedAffineApply(b, loc, d0 + d1, {base, iv});
     cond = lb.create<arith::CmpIOp>(arith::CmpIPredicate::sgt, memrefDim,
                                     memrefIdx);
   }

   // Condition check 2: Masked in?
   if (auto maskCond = generateMaskCheck(b, xferOp, iv)) {
     if (cond)
       cond = lb.create<arith::AndIOp>(cond, maskCond);
     else
       cond = maskCond;
   }

   // If the condition is non-empty, generate an SCF::IfOp.
   if (cond) {
     auto check = lb.create<scf::IfOp>(
         resultTypes, cond,
         /*thenBuilder=*/
         [&](OpBuilder &b, Location loc) {
           maybeYieldValue(b, loc, hasRetVal, inBoundsCase(b, loc));
         },
         /*elseBuilder=*/
         [&](OpBuilder &b, Location loc) {
           if (outOfBoundsCase) {
             maybeYieldValue(b, loc, hasRetVal, outOfBoundsCase(b, loc));
           } else {
             b.create<scf::YieldOp>(loc);
           }
         });

     return hasRetVal ? check.getResult(0) : Value();
   }

   // Condition is empty, no need for an SCF::IfOp.
   return inBoundsCase(b, loc);
 }

 /// In this function variant, `inBoundsCase` and `outOfBoundsCase` do not have
 /// a return value. Consequently, this function does not have a return value.
 template <typename OpTy>
 static void generateInBoundsCheck(
     OpBuilder &b, OpTy xferOp, Value iv, Optional<int64_t> dim,
     function_ref<void(OpBuilder &, Location)> inBoundsCase,
     function_ref<void(OpBuilder &, Location)> outOfBoundsCase = nullptr) {
   generateInBoundsCheck(
       b, xferOp, iv, dim, /*resultTypes=*/TypeRange(),
       /*inBoundsCase=*/
       [&](OpBuilder &b, Location loc) {
         inBoundsCase(b, loc);
         return Value();
       },
       /*outOfBoundsCase=*/
       [&](OpBuilder &b, Location loc) {
         if (outOfBoundsCase)
           outOfBoundsCase(b, loc);
         return Value();
       });
 }

 /// Given an ArrayAttr, return a copy where the first element is dropped.
 static ArrayAttr dropFirstElem(OpBuilder &b, ArrayAttr attr) {
   if (!attr)
     return attr;
   return ArrayAttr::get(b.getContext(), attr.getValue().drop_front());
 }

 /// Add the pass label to a vector transfer op if its rank is not the target
 /// rank.
 template <typename OpTy>
 static void maybeApplyPassLabel(OpBuilder &b, OpTy newXferOp,
                                 unsigned targetRank) {
   if (newXferOp.getVectorType().getRank() > targetRank)
     newXferOp->setAttr(kPassLabel, b.getUnitAttr());
 }

 /// Return true if this transfer op operates on a source tensor.
 template <typename OpTy>
 static bool isTensorOp(OpTy xferOp) {
   if (xferOp.getShapedType().template isa<RankedTensorType>()) {
     if (xferOp.getOperationName().equals(TransferWriteOp::getOperationName())) {
       // TransferWriteOps on tensors have a result.
       assert(xferOp->getNumResults() > 0);
     }
     return true;
   }
   return false;
 }

 namespace lowering_n_d {

 /// Helper data structure for data and mask buffers.
 struct BufferAllocs {
   Value dataBuffer;
   Value maskBuffer;
 };

 /// Allocate temporary buffers for data (vector) and mask (if present).
 /// TODO: Parallelism and threadlocal considerations.
 template <typename OpTy>
 static BufferAllocs allocBuffers(OpBuilder &b, OpTy xferOp) {
   Location loc = xferOp.getLoc();
   OpBuilder::InsertionGuard guard(b);
   Operation *scope =
       xferOp->template getParentWithTrait<OpTrait::AutomaticAllocationScope>();
   assert(scope && "Expected op to be inside automatic allocation scope");
   b.setInsertionPointToStart(&scope->getRegion(0).front());

   BufferAllocs result;
   auto bufferType = MemRefType::get({}, xferOp.getVectorType());
   result.dataBuffer = b.create<memref::AllocaOp>(loc, bufferType);

   if (xferOp.mask()) {
     auto maskType = MemRefType::get({}, xferOp.mask().getType());
     auto maskBuffer = b.create<memref::AllocaOp>(loc, maskType);
     b.setInsertionPoint(xferOp);
     b.create<memref::StoreOp>(loc, xferOp.mask(), maskBuffer);
     result.maskBuffer = b.create<memref::LoadOp>(loc, maskBuffer);
   }

   return result;
 }

 /// Given a MemRefType with VectorType element type, unpack one dimension from
 /// the VectorType into the MemRefType.
 ///
 /// E.g.: memref<9xvector<5x6xf32>> --> memref<9x5xvector<6xf32>>
 static MemRefType unpackOneDim(MemRefType type) {
   auto vectorType = type.getElementType().dyn_cast<VectorType>();
   auto memrefShape = type.getShape();
   SmallVector<int64_t, 8> newMemrefShape;
   newMemrefShape.append(memrefShape.begin(), memrefShape.end());
   newMemrefShape.push_back(vectorType.getDimSize(0));
   return MemRefType::get(newMemrefShape,
                          VectorType::get(vectorType.getShape().drop_front(),
                                          vectorType.getElementType()));
 }

 /// Given a transfer op, find the memref from which the mask is loaded. This
 /// is similar to Strategy<TransferWriteOp>::getBuffer.
 template <typename OpTy>
 static Value getMaskBuffer(OpTy xferOp) {
   assert(xferOp.mask() && "Expected that transfer op has mask");
   auto loadOp = xferOp.mask().template getDefiningOp<memref::LoadOp>();
   assert(loadOp && "Expected transfer op mask produced by LoadOp");
   return loadOp.getMemRef();
 }

 /// Codegen strategy, depending on the operation.
 template <typename OpTy>
 struct Strategy;

 /// Code strategy for vector TransferReadOp.
 template <>
 struct Strategy<TransferReadOp> {
   /// Find the StoreOp that is used for writing the current TransferReadOp's
   /// result to the temporary buffer allocation.
   static memref::StoreOp getStoreOp(TransferReadOp xferOp) {
     assert(xferOp->hasOneUse() && "Expected exactly one use of TransferReadOp");
     auto storeOp = dyn_cast<memref::StoreOp>((*xferOp->use_begin()).getOwner());
     assert(storeOp && "Expected TransferReadOp result used by StoreOp");
     return storeOp;
   }

   /// Find the temporary buffer allocation. All labeled TransferReadOps are
   /// used like this, where %buf is either the buffer allocation or a type cast
   /// of the buffer allocation:
   /// ```
   /// %vec = vector.transfer_read ... { __vector_to_scf_lowering__ } ...
   /// memref.store %vec, %buf[...] ...
   /// ```
   static Value getBuffer(TransferReadOp xferOp) {
     return getStoreOp(xferOp).getMemRef();
   }

   /// Retrieve the indices of the current StoreOp that stores into the buffer.
   static void getBufferIndices(TransferReadOp xferOp,
                                SmallVector<Value, 8> &indices) {
     auto storeOp = getStoreOp(xferOp);
     auto prevIndices = memref::StoreOpAdaptor(storeOp).indices();
     indices.append(prevIndices.begin(), prevIndices.end());
   }

   /// Rewrite the TransferReadOp, assuming that there are no out-of-bounds
   /// accesses on the to-be-unpacked dimension.
   ///
   /// 1. Generate a new (N-1)-d TransferReadOp using the loop iteration
   ///    variable `iv`.
   /// 2. Store the result into the (already `vector.type_cast`ed) buffer.
   ///
   /// E.g.:
   /// ```
   /// %vec = vector.transfer_read %A[%a+%i, %b, %c], %cst
   ///     : memref<?x?x?xf32>, vector<4x3xf32>
   /// memref.store %vec, %buf[%i] : memref<5xvector<4x3xf32>>
   /// ```
   /// Is rewritten to:
   /// ```
   /// %casted = vector.type_cast %buf
   ///     : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>>
   /// for %j = 0 to 4 {
   ///   %vec = vector.transfer_read %A[%a+%i, %b+%j, %c], %cst
   ///       : memref<?x?x?xf32>, vector<3xf32>
   ///   memref.store %vec, %casted[%i, %j] : memref<5x4xvector<3xf32>>
   /// }
   /// ```
   ///
   /// Note: The loop and type cast are generated in TransferOpConversion.
   ///       The original TransferReadOp and store op are deleted in `cleanup`.
   /// Note: The `mask` operand is set in TransferOpConversion.
   static TransferReadOp rewriteOp(OpBuilder &b,
                                   VectorTransferToSCFOptions options,
                                   TransferReadOp xferOp, Value buffer, Value iv,
                                   ValueRange /*loopState*/) {
     SmallVector<Value, 8> storeIndices;
     getBufferIndices(xferOp, storeIndices);
     storeIndices.push_back(iv);

     SmallVector<Value, 8> xferIndices;
     getXferIndices(b, xferOp, iv, xferIndices);

     Location loc = xferOp.getLoc();
     auto bufferType = buffer.getType().dyn_cast<ShapedType>();
     auto vecType = bufferType.getElementType().dyn_cast<VectorType>();
     auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr());
     auto newXferOp = b.create<vector::TransferReadOp>(
         loc, vecType, xferOp.source(), xferIndices,
         AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), xferOp.padding(),
         Value(), inBoundsAttr);

     maybeApplyPassLabel(b, newXferOp, options.targetRank);

     b.create<memref::StoreOp>(loc, newXferOp.vector(), buffer, storeIndices);
     return newXferOp;
   }

   /// Handle out-of-bounds accesses on the to-be-unpacked dimension: Write
   /// padding value to the temporary buffer.
   static Value handleOutOfBoundsDim(OpBuilder &b, TransferReadOp xferOp,
                                     Value buffer, Value iv,
                                     ValueRange /*loopState*/) {
     SmallVector<Value, 8> storeIndices;
     getBufferIndices(xferOp, storeIndices);
     storeIndices.push_back(iv);

     Location loc = xferOp.getLoc();
     auto bufferType = buffer.getType().dyn_cast<ShapedType>();
     auto vecType = bufferType.getElementType().dyn_cast<VectorType>();
     auto vec = b.create<SplatOp>(loc, vecType, xferOp.padding());
     b.create<memref::StoreOp>(loc, vec, buffer, storeIndices);

     return Value();
   }

   /// Cleanup after rewriting the op.
   static void cleanup(PatternRewriter &rewriter, TransferReadOp xferOp,
                       scf::ForOp /*forOp*/) {
     rewriter.eraseOp(getStoreOp(xferOp));
     rewriter.eraseOp(xferOp);
   }

   /// Return the initial loop state for the generated scf.for loop.
   static Value initialLoopState(TransferReadOp xferOp) { return Value(); }
 };

 /// Codegen strategy for vector TransferWriteOp.
 template <>
 struct Strategy<TransferWriteOp> {
   /// Find the temporary buffer allocation. All labeled TransferWriteOps are
   /// used like this, where %buf is either the buffer allocation or a type cast
   /// of the buffer allocation:
   /// ```
   /// %vec = memref.load %buf[...] ...
   /// vector.transfer_write %vec ... { __vector_to_scf_lowering__ } ...
   /// ```
   static Value getBuffer(TransferWriteOp xferOp) {
     auto loadOp = xferOp.vector().getDefiningOp<memref::LoadOp>();
     assert(loadOp && "Expected transfer op vector produced by LoadOp");
     return loadOp.getMemRef();
   }

   /// Retrieve the indices of the current LoadOp that loads from the buffer.
   static void getBufferIndices(TransferWriteOp xferOp,
                                SmallVector<Value, 8> &indices) {
     auto loadOp = xferOp.vector().getDefiningOp<memref::LoadOp>();
     auto prevIndices = memref::LoadOpAdaptor(loadOp).indices();
     indices.append(prevIndices.begin(), prevIndices.end());
   }

   /// Rewrite the TransferWriteOp, assuming that there are no out-of-bounds
   /// accesses on the to-be-unpacked dimension.
   ///
   /// 1. Load an (N-1)-d vector from the (already `vector.type_cast`ed) buffer,
   ///    using the loop iteration variable `iv`.
   /// 2. Generate a new (N-1)-d TransferWriteOp, writing the loaded vector back
   ///    to memory.
   ///
   /// Note: For more details, see comments on Strategy<TransferReadOp>.
   static TransferWriteOp rewriteOp(OpBuilder &b,
                                    VectorTransferToSCFOptions options,
                                    TransferWriteOp xferOp, Value buffer,
                                    Value iv, ValueRange loopState) {
     SmallVector<Value, 8> loadIndices;
     getBufferIndices(xferOp, loadIndices);
     loadIndices.push_back(iv);

     SmallVector<Value, 8> xferIndices;
     getXferIndices(b, xferOp, iv, xferIndices);

     Location loc = xferOp.getLoc();
     auto vec = b.create<memref::LoadOp>(loc, buffer, loadIndices);
     auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr());
     auto source = loopState.empty() ? xferOp.source() : loopState[0];
     Type type = isTensorOp(xferOp) ? xferOp.getShapedType() : Type();
     auto newXferOp = b.create<vector::TransferWriteOp>(
         loc, type, vec, source, xferIndices,
         AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), Value(),
         inBoundsAttr);

     maybeApplyPassLabel(b, newXferOp, options.targetRank);

     return newXferOp;
   }

   /// Handle out-of-bounds accesses on the to-be-unpacked dimension.
   static Value handleOutOfBoundsDim(OpBuilder &b, TransferWriteOp xferOp,
                                     Value buffer, Value iv,
                                     ValueRange loopState) {
     return isTensorOp(xferOp) ? loopState[0] : Value();
   }

   /// Cleanup after rewriting the op.
   static void cleanup(PatternRewriter &rewriter, TransferWriteOp xferOp,
                       scf::ForOp forOp) {
     if (isTensorOp(xferOp)) {
       assert(forOp->getNumResults() == 1 && "Expected one for loop result");
       rewriter.replaceOp(xferOp, forOp->getResult(0));
     } else {
       rewriter.eraseOp(xferOp);
     }
   }

   /// Return the initial loop state for the generated scf.for loop.
   static Value initialLoopState(TransferWriteOp xferOp) {
     return isTensorOp(xferOp) ? xferOp.source() : Value();
   }
 };

 template <typename OpTy>
 LogicalResult checkPrepareXferOp(OpTy xferOp,
                                  VectorTransferToSCFOptions options) {
   if (xferOp->hasAttr(kPassLabel))
     return failure();
   if (xferOp.getVectorType().getRank() <= options.targetRank)
     return failure();
   if (isTensorOp(xferOp) && !options.lowerTensors)
     return failure();
   // Transfer ops that modify the element type are not supported atm.
   if (xferOp.getVectorType().getElementType() !=
       xferOp.getShapedType().getElementType())
     return failure();
   return success();
 }

 /// Prepare a TransferReadOp for progressive lowering.
 ///
 /// 1. Allocate a temporary buffer.
 /// 2. Label the TransferReadOp, marking it eligible for progressive lowering.
 /// 3. Store the result of the TransferReadOp into the temporary buffer.
 /// 4. Load the result from the temporary buffer and replace all uses of the
 ///    original TransferReadOp with this load.
 ///
 /// E.g.:
 /// ```
 /// %vec = vector.transfer_read %A[%a, %b, %c], %cst
 ///     : vector<5x4xf32>, memref<?x?x?xf32>
 /// ```
 /// is rewritten to:
 /// ```
 /// %0 = memref.alloca() : memref<vector<5x4xf32>>
 /// %1 = vector.transfer_read %A[%a, %b, %c], %cst
 ///     { __vector_to_scf_lowering__ } : vector<5x4xf32>, memref<?x?x?xf32>
 /// memref.store %1, %0[] : memref<vector<5x4xf32>>
 /// %vec = memref.load %0[] : memref<vector<5x4xf32>>
 /// ```
 ///
 /// Note: A second temporary buffer may be allocated for the `mask` operand.
 struct PrepareTransferReadConversion
     : public VectorToSCFPattern<TransferReadOp> {
   using VectorToSCFPattern<TransferReadOp>::VectorToSCFPattern;

   LogicalResult matchAndRewrite(TransferReadOp xferOp,
                                 PatternRewriter &rewriter) const override {
     if (checkPrepareXferOp(xferOp, options).failed())
       return failure();

     auto buffers = allocBuffers(rewriter, xferOp);
     auto *newXfer = rewriter.clone(*xferOp.getOperation());
     newXfer->setAttr(kPassLabel, rewriter.getUnitAttr());
     if (xferOp.mask()) {
       dyn_cast<TransferReadOp>(newXfer).maskMutable().assign(
           buffers.maskBuffer);
     }

     Location loc = xferOp.getLoc();
     rewriter.create<memref::StoreOp>(loc, newXfer->getResult(0),
                                      buffers.dataBuffer);
     rewriter.replaceOpWithNewOp<memref::LoadOp>(xferOp, buffers.dataBuffer);

     return success();
   }
 };

 /// Prepare a TransferWriteOp for progressive lowering.
 ///
 /// 1. Allocate a temporary buffer.
 /// 2. Store the vector into the buffer.
 /// 3. Load the vector from the buffer again.
 /// 4. Use the loaded vector as a TransferWriteOp operand and label the op,
 ///    marking it eligible for progressive lowering via TransferOpConversion.
 ///
 /// E.g.:
 /// ```
 /// vector.transfer_write %vec, %A[%a, %b, %c]
 ///     : vector<5x4xf32>, memref<?x?x?xf32>
 /// ```
 /// is rewritten to:
 /// ```
 /// %0 = memref.alloca() : memref<vector<5x4xf32>>
 /// memref.store %vec, %0[] : memref<vector<5x4xf32>>
 /// %1 = memref.load %0[] : memref<vector<5x4xf32>>
 /// vector.transfer_write %1, %A[%a, %b, %c] { __vector_to_scf_lowering__ }
 ///     : vector<5x4xf32>, memref<?x?x?xf32>
 /// ```
 ///
 /// Note: A second temporary buffer may be allocated for the `mask` operand.
 struct PrepareTransferWriteConversion
     : public VectorToSCFPattern<TransferWriteOp> {
   using VectorToSCFPattern<TransferWriteOp>::VectorToSCFPattern;

   LogicalResult matchAndRewrite(TransferWriteOp xferOp,
                                 PatternRewriter &rewriter) const override {
     if (checkPrepareXferOp(xferOp, options).failed())
       return failure();

     Location loc = xferOp.getLoc();
     auto buffers = allocBuffers(rewriter, xferOp);
     rewriter.create<memref::StoreOp>(loc, xferOp.vector(), buffers.dataBuffer);
     auto loadedVec = rewriter.create<memref::LoadOp>(loc, buffers.dataBuffer);
     rewriter.updateRootInPlace(xferOp, [&]() {
       xferOp.vectorMutable().assign(loadedVec);
       xferOp->setAttr(kPassLabel, rewriter.getUnitAttr());
     });

     if (xferOp.mask()) {
       rewriter.updateRootInPlace(
           xferOp, [&]() { xferOp.maskMutable().assign(buffers.maskBuffer); });
     }

     return success();
   }
 };

 /// Progressive lowering of vector transfer ops: Unpack one dimension.
 ///
 /// 1. Unpack one dimension from the current buffer type and cast the buffer
 ///    to that new type. E.g.:
 ///    ```
 ///    %vec = memref.load %0[%1] : memref<5xvector<4x3xf32>>
 ///    vector.transfer_write %vec ...
 ///    ```
 ///    The following cast is generated:
 ///    ```
 ///    %casted = vector.type_cast %0
 ///        : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>>
 ///    ```
 /// 2. Generate a for loop and rewrite the transfer op according to the
 ///    corresponding Strategy<OpTy>. If the to-be-unpacked dimension can be
 ///    out-of-bounds, generate an if-check and handle both cases separately.
 /// 3. Clean up according to the corresponding Strategy<OpTy>.
 ///
 /// Note: If the transfer op is a TransferWriteOp and operates on a tensor
 /// source (as opposed to a memref source), then each iteration of the generated
 /// scf.for loop yields the new tensor value. E.g.:
 /// ```
 /// %result = scf.for i = 0 to 5 {
 ///   %0 = memref.load %buffer[i] : memref<5xvector<4x3xf32>>
 ///   %1 = vector.transfer_write %0, %source[...]
 ///       : vector<4x3xf32>, tensor<5x4x3xf32>
 ///   scf.yield %1 : tensor<5x4x3xf32>
 /// }
 /// ```
 template <typename OpTy>
 struct TransferOpConversion : public VectorToSCFPattern<OpTy> {
   using VectorToSCFPattern<OpTy>::VectorToSCFPattern;

   void initialize() {
     // This pattern recursively unpacks one dimension at a time. The recursion
     // bounded as the rank is strictly decreasing.
     this->setHasBoundedRewriteRecursion();
   }

   LogicalResult matchAndRewrite(OpTy xferOp,
                                 PatternRewriter &rewriter) const override {
     if (!xferOp->hasAttr(kPassLabel))
       return failure();

     // Find and cast data buffer. How the buffer can be found depends on OpTy.
     ImplicitLocOpBuilder locB(xferOp.getLoc(), rewriter);
     auto dataBuffer = Strategy<OpTy>::getBuffer(xferOp);
     auto dataBufferType = dataBuffer.getType().template dyn_cast<MemRefType>();
     auto castedDataType = unpackOneDim(dataBufferType);
     auto castedDataBuffer =
         locB.create<vector::TypeCastOp>(castedDataType, dataBuffer);

     // If the xferOp has a mask: Find and cast mask buffer.
     Value castedMaskBuffer;
     if (xferOp.mask()) {
       auto maskBuffer = getMaskBuffer(xferOp);
       auto maskBufferType =
           maskBuffer.getType().template dyn_cast<MemRefType>();
       if (xferOp.isBroadcastDim(0) || xferOp.getMaskType().getRank() == 1) {
         // Do not unpack a dimension of the mask, if:
         // * To-be-unpacked transfer op dimension is a broadcast.
         // * Mask is 1D, i.e., the mask cannot be further unpacked.
         //   (That means that all remaining dimensions of the transfer op must
         //   be broadcasted.)
         castedMaskBuffer = maskBuffer;
       } else {
         auto castedMaskType = unpackOneDim(maskBufferType);
         castedMaskBuffer =
             locB.create<vector::TypeCastOp>(castedMaskType, maskBuffer);
       }
     }

     // Loop bounds and step.
     auto lb = locB.create<arith::ConstantIndexOp>(0);
     auto ub = locB.create<arith::ConstantIndexOp>(
         castedDataType.getDimSize(castedDataType.getRank() - 1));
     auto step = locB.create<arith::ConstantIndexOp>(1);
     // TransferWriteOps that operate on tensors return the modified tensor and
     // require a loop state.
     auto loopState = Strategy<OpTy>::initialLoopState(xferOp);

     // Generate for loop.
     auto result = locB.create<scf::ForOp>(
         lb, ub, step, loopState ? ValueRange(loopState) : ValueRange(),
         [&](OpBuilder &b, Location loc, Value iv, ValueRange loopState) {
           Type stateType = loopState.empty() ? Type() : loopState[0].getType();

           auto result = generateInBoundsCheck(
               b, xferOp, iv, unpackedDim(xferOp),
               stateType ? TypeRange(stateType) : TypeRange(),
               /*inBoundsCase=*/
               [&](OpBuilder &b, Location loc) {
                 // Create new transfer op.
                 OpTy newXfer = Strategy<OpTy>::rewriteOp(
                     b, this->options, xferOp, castedDataBuffer, iv, loopState);

                 // If old transfer op has a mask: Set mask on new transfer op.
                 // Special case: If the mask of the old transfer op is 1D and
                 // the
                 //               unpacked dim is not a broadcast, no mask is
                 //               needed on the new transfer op.
                 if (xferOp.mask() && (xferOp.isBroadcastDim(0) ||
                                       xferOp.getMaskType().getRank() > 1)) {
                   OpBuilder::InsertionGuard guard(b);
                   b.setInsertionPoint(newXfer); // Insert load before newXfer.

                   SmallVector<Value, 8> loadIndices;
                   Strategy<OpTy>::getBufferIndices(xferOp, loadIndices);
                   // In case of broadcast: Use same indices to load from memref
                   // as before.
                   if (!xferOp.isBroadcastDim(0))
                     loadIndices.push_back(iv);

                   auto mask = b.create<memref::LoadOp>(loc, castedMaskBuffer,
                                                        loadIndices);
                   rewriter.updateRootInPlace(
                       newXfer, [&]() { newXfer.maskMutable().assign(mask); });
                 }

                 return loopState.empty() ? Value() : newXfer->getResult(0);
               },
               /*outOfBoundsCase=*/
               [&](OpBuilder &b, Location /*loc*/) {
                 return Strategy<OpTy>::handleOutOfBoundsDim(
                     b, xferOp, castedDataBuffer, iv, loopState);
               });

           maybeYieldValue(b, loc, !loopState.empty(), result);
         });

     Strategy<OpTy>::cleanup(rewriter, xferOp, result);
     return success();
   }
 };

 } // namespace lowering_n_d

 namespace lowering_n_d_unrolled {

 /// If the original transfer op has a mask, compute the mask of the new transfer
 /// op (for the current iteration `i`) and assign it.
 template <typename OpTy>
 static void maybeAssignMask(OpBuilder &b, OpTy xferOp, OpTy newXferOp,
                             int64_t i) {
   if (!xferOp.mask())
     return;

   if (xferOp.isBroadcastDim(0)) {
     // To-be-unpacked dimension is a broadcast, which does not have a
     // corresponding mask dimension. Mask attribute remains unchanged.
     newXferOp.maskMutable().assign(xferOp.mask());
     return;
   }

   if (xferOp.getMaskType().getRank() > 1) {
     // Unpack one dimension of the mask.
     OpBuilder::InsertionGuard guard(b);
     b.setInsertionPoint(newXferOp); // Insert load before newXfer.

     llvm::SmallVector<int64_t, 1> indices({i});
     Location loc = xferOp.getLoc();
     auto newMask = b.create<vector::ExtractOp>(loc, xferOp.mask(), indices);
     newXferOp.maskMutable().assign(newMask);
   }

   // If we end up here: The mask of the old transfer op is 1D and the unpacked
   // dim is not a broadcast, so no mask is needed on the new transfer op.
   // `generateInBoundsCheck` will have evaluated the mask already.
 }

 /// Progressive lowering of vector TransferReadOp with unrolling: Unpack one
 /// dimension. This is similar to TransferOpConversion<TransferReadOp>, but no
 /// memref buffer is allocated and the SCF loop is fully unrolled.
 ///
 /// ```
 /// E.g.:
 /// ```
 /// %vec = vector.transfer_read %A[%a, %b, %c], %padding
 ///     : memref<?x?x?xf32>, vector<5x4xf32>
 /// ```
 /// is rewritten to IR such as (simplified):
 /// ```
 /// %v_init = splat %padding : vector<5x4xf32>
 /// %tmp0 = vector.transfer_read %A[%a, %b, %c], %padding
 ///     : memref<?x?x?xf32>, vector<4xf32>
 /// %v0 = vector.insert %tmp0, %v_init[0] : vector<4xf32> into vector<5x4xf32>
 /// %tmp1 = vector.transfer_read %A[%a, %b + 1, %c], %padding
 ///     : memref<?x?x?xf32>, vector<4xf32>
 /// %v1 = vector.insert %tmp1, %v0[1] : vector<4xf32> into vector<5x4xf32>
 /// ...
 /// %tmp4 = vector.transfer_read %A[%a, %b + 4, %c], %padding
 ///     : memref<?x?x?xf32>, vector<4xf32>
 /// %vec = vector.insert %tmp1, %v3[4] : vector<4xf32> into vector<5x4xf32>
 /// ```
 ///
 /// Note: As an optimization, if the result of the original TransferReadOp
 /// was directly inserted into another vector, no new %v_init vector is created.
 /// Instead, the new TransferReadOp results are inserted into that vector.
 struct UnrollTransferReadConversion
     : public VectorToSCFPattern<TransferReadOp> {
   using VectorToSCFPattern<TransferReadOp>::VectorToSCFPattern;

   void initialize() {
     // This pattern recursively unpacks one dimension at a time. The recursion
     // bounded as the rank is strictly decreasing.
     setHasBoundedRewriteRecursion();
   }

   /// Return the vector into which the newly created TransferReadOp results
   /// are inserted.
   Value getResultVector(TransferReadOp xferOp,
                         PatternRewriter &rewriter) const {
     if (auto insertOp = getInsertOp(xferOp))
       return insertOp.dest();
     Location loc = xferOp.getLoc();
     return rewriter.create<SplatOp>(loc, xferOp.getVectorType(),
                                     xferOp.padding());
   }

   /// If the result of the TransferReadOp has exactly one user, which is a
   /// vector::InsertOp, return that operation.
   vector::InsertOp getInsertOp(TransferReadOp xferOp) const {
     if (xferOp->hasOneUse()) {
       Operation *xferOpUser = *xferOp->getUsers().begin();
       if (auto insertOp = dyn_cast<vector::InsertOp>(xferOpUser))
         return insertOp;
     }

     return vector::InsertOp();
   }

   /// If the result of the TransferReadOp has exactly one user, which is a
   /// vector::InsertOp, return that operation's indices.
   void getInsertionIndices(TransferReadOp xferOp,
                            SmallVector<int64_t, 8> &indices) const {
     if (auto insertOp = getInsertOp(xferOp)) {
       llvm::for_each(insertOp.position(), [&](Attribute attr) {
         indices.push_back(attr.dyn_cast<IntegerAttr>().getInt());
       });
     }
   }

   /// Rewrite the op: Unpack one dimension. Can handle masks, out-of-bounds
   /// accesses, and broadcasts and transposes in permutation maps.
   LogicalResult matchAndRewrite(TransferReadOp xferOp,
                                 PatternRewriter &rewriter) const override {
     if (xferOp.getVectorType().getRank() <= options.targetRank)
       return failure();
     if (isTensorOp(xferOp) && !options.lowerTensors)
       return failure();
     // Transfer ops that modify the element type are not supported atm.
     if (xferOp.getVectorType().getElementType() !=
         xferOp.getShapedType().getElementType())
       return failure();

     auto insertOp = getInsertOp(xferOp);
     auto vec = getResultVector(xferOp, rewriter);
     auto vecType = vec.getType().dyn_cast<VectorType>();
     auto xferVecType = xferOp.getVectorType();
     auto newXferVecType = VectorType::get(xferVecType.getShape().drop_front(),
                                           xferVecType.getElementType());
     int64_t dimSize = xferVecType.getShape()[0];

     // Generate fully unrolled loop of transfer ops.
     Location loc = xferOp.getLoc();
     for (int64_t i = 0; i < dimSize; ++i) {
       Value iv = rewriter.create<arith::ConstantIndexOp>(loc, i);

       vec = generateInBoundsCheck(
           rewriter, xferOp, iv, unpackedDim(xferOp), TypeRange(vecType),
           /*inBoundsCase=*/
           [&](OpBuilder &b, Location loc) {
             // Indices for the new transfer op.
             SmallVector<Value, 8> xferIndices;
             getXferIndices(b, xferOp, iv, xferIndices);

             // Indices for the new vector.insert op.
             SmallVector<int64_t, 8> insertionIndices;
             getInsertionIndices(xferOp, insertionIndices);
             insertionIndices.push_back(i);

             auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr());
             auto newXferOp = b.create<vector::TransferReadOp>(
                 loc, newXferVecType, xferOp.source(), xferIndices,
                 AffineMapAttr::get(unpackedPermutationMap(b, xferOp)),
                 xferOp.padding(), Value(), inBoundsAttr);
             maybeAssignMask(b, xferOp, newXferOp, i);
             return b.create<vector::InsertOp>(loc, newXferOp, vec,
                                               insertionIndices);
           },
           /*outOfBoundsCase=*/
           [&](OpBuilder &b, Location loc) {
             // Loop through original (unmodified) vector.
             return vec;
           });
     }

     if (insertOp) {
       // Rewrite single user of the old TransferReadOp, which was an InsertOp.
       rewriter.replaceOp(insertOp, vec);
       rewriter.eraseOp(xferOp);
     } else {
       rewriter.replaceOp(xferOp, vec);
     }

     return success();
   }
 };

 /// Progressive lowering of vector TransferWriteOp with unrolling: Unpack one
 /// dimension. This is similar to TransferOpConversion<TransferWriteOp>, but no
 /// memref buffer is allocated and the SCF loop is fully unrolled.
 ///
 /// ```
 /// E.g.:
 /// ```
 /// vector.transfer_write %vec, %A[%a, %b, %c]
 ///     : vector<5x4xf32>, memref<?x?x?xf32>
 /// ```
 /// is rewritten to IR such as (simplified):
 /// ```
 /// %v0 = vector.extract %vec[0] : vector<5x4xf32>
 /// vector.transfer_write %v0, %A[%a, %b, %c] : vector<4xf32>, memref<...>
 /// %v1 = vector.extract %vec[1] : vector<5x4xf32>
 /// vector.transfer_write %v1, %A[%a, %b + 1, %c] : vector<4xf32>, memref<...>
 /// ...
 /// %v4 = vector.extract %vec[4] : vector<5x4xf32>
 /// vector.transfer_write %v4, %A[%a, %b + 4, %c] : vector<4xf32>, memref<...>
 /// ```
 ///
 /// Note: As an optimization, if the vector of the original TransferWriteOp
 /// was directly extracted from another vector via an ExtractOp `a`, extract
 /// the vectors for the newly generated TransferWriteOps from `a`'s input. By
 /// doing so, `a` may become dead, and the number of ExtractOps generated during
 /// recursive application of this pattern will be minimal.
 struct UnrollTransferWriteConversion
     : public VectorToSCFPattern<TransferWriteOp> {
   using VectorToSCFPattern<TransferWriteOp>::VectorToSCFPattern;

   void initialize() {
     // This pattern recursively unpacks one dimension at a time. The recursion
     // bounded as the rank is strictly decreasing.
     setHasBoundedRewriteRecursion();
   }

   /// Return the vector from which newly generated ExtracOps will extract.
   Value getDataVector(TransferWriteOp xferOp) const {
     if (auto extractOp = getExtractOp(xferOp))
       return extractOp.vector();
     return xferOp.vector();
   }

   /// If the input of the given TransferWriteOp is an ExtractOp, return it.
   vector::ExtractOp getExtractOp(TransferWriteOp xferOp) const {
     if (auto *op = xferOp.vector().getDefiningOp())
       return dyn_cast<vector::ExtractOp>(op);
     return vector::ExtractOp();
   }

   /// If the input of the given TransferWriteOp is an ExtractOp, return its
   /// indices.
   void getExtractionIndices(TransferWriteOp xferOp,
                             SmallVector<int64_t, 8> &indices) const {
     if (auto extractOp = getExtractOp(xferOp)) {
       llvm::for_each(extractOp.position(), [&](Attribute attr) {
         indices.push_back(attr.dyn_cast<IntegerAttr>().getInt());
       });
     }
   }

   /// Rewrite the op: Unpack one dimension. Can handle masks, out-of-bounds
   /// accesses, and broadcasts and transposes in permutation maps.
   LogicalResult matchAndRewrite(TransferWriteOp xferOp,
                                 PatternRewriter &rewriter) const override {
     if (xferOp.getVectorType().getRank() <= options.targetRank)
       return failure();
     if (isTensorOp(xferOp) && !options.lowerTensors)
       return failure();
     // Transfer ops that modify the element type are not supported atm.
     if (xferOp.getVectorType().getElementType() !=
         xferOp.getShapedType().getElementType())
       return failure();

     auto vec = getDataVector(xferOp);
     auto xferVecType = xferOp.getVectorType();
     int64_t dimSize = xferVecType.getShape()[0];
     auto source = xferOp.source(); // memref or tensor to be written to.
     auto sourceType = isTensorOp(xferOp) ? xferOp.getShapedType() : Type();

     // Generate fully unrolled loop of transfer ops.
     Location loc = xferOp.getLoc();
     for (int64_t i = 0; i < dimSize; ++i) {
       Value iv = rewriter.create<arith::ConstantIndexOp>(loc, i);

       auto updatedSource = generateInBoundsCheck(
           rewriter, xferOp, iv, unpackedDim(xferOp),
           isTensorOp(xferOp) ? TypeRange(sourceType) : TypeRange(),
           /*inBoundsCase=*/
           [&](OpBuilder &b, Location loc) {
             // Indices for the new transfer op.
             SmallVector<Value, 8> xferIndices;
             getXferIndices(b, xferOp, iv, xferIndices);

             // Indices for the new vector.extract op.
             SmallVector<int64_t, 8> extractionIndices;
             getExtractionIndices(xferOp, extractionIndices);
             extractionIndices.push_back(i);

             auto extracted =
                 b.create<vector::ExtractOp>(loc, vec, extractionIndices);
             auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr());
             auto newXferOp = b.create<vector::TransferWriteOp>(
                 loc, sourceType, extracted, source, xferIndices,
                 AffineMapAttr::get(unpackedPermutationMap(b, xferOp)), Value(),
                 inBoundsAttr);

             maybeAssignMask(b, xferOp, newXferOp, i);

             return isTensorOp(xferOp) ? newXferOp->getResult(0) : Value();
           },
           /*outOfBoundsCase=*/
           [&](OpBuilder &b, Location loc) {
             return isTensorOp(xferOp) ? source : Value();
           });

       if (isTensorOp(xferOp))
         source = updatedSource;
     }

     if (isTensorOp(xferOp))
       rewriter.replaceOp(xferOp, source);
     else
       rewriter.eraseOp(xferOp);

     return success();
   }
 };

 } // namespace lowering_n_d_unrolled

 namespace lowering_1_d {

 /// Compute the indices into the memref for the LoadOp/StoreOp generated as
 /// part of TransferOp1dConversion. Return the memref dimension on which
 /// the transfer is operating. A return value of None indicates a broadcast.
 template <typename OpTy>
 static Optional<int64_t>
 get1dMemrefIndices(OpBuilder &b, OpTy xferOp, Value iv,
                    SmallVector<Value, 8> &memrefIndices) {
   auto indices = xferOp.indices();
   auto map = xferOp.permutation_map();
   assert(xferOp.getTransferRank() > 0 && "unexpected 0-d transfer");

   memrefIndices.append(indices.begin(), indices.end());
   assert(map.getNumResults() == 1 &&
          "Expected 1 permutation map result for 1D transfer");
   if (auto expr = map.getResult(0).template dyn_cast<AffineDimExpr>()) {
     Location loc = xferOp.getLoc();
     auto dim = expr.getPosition();
     AffineExpr d0, d1;
     bindDims(xferOp.getContext(), d0, d1);
     Value offset = memrefIndices[dim];
     memrefIndices[dim] = makeComposedAffineApply(b, loc, d0 + d1, {offset, iv});
     return dim;
   }

   assert(xferOp.isBroadcastDim(0) &&
          "Expected AffineDimExpr or AffineConstantExpr");
   return None;
 }

 /// Codegen strategy for TransferOp1dConversion, depending on the
 /// operation.
 template <typename OpTy>
 struct Strategy1d;

 /// Codegen strategy for TransferReadOp.
 template <>
 struct Strategy1d<TransferReadOp> {
   static void generateForLoopBody(OpBuilder &b, Location loc,
                                   TransferReadOp xferOp, Value iv,
                                   ValueRange loopState) {
     SmallVector<Value, 8> indices;
     auto dim = get1dMemrefIndices(b, xferOp, iv, indices);
     auto vec = loopState[0];

     // In case of out-of-bounds access, leave `vec` as is (was initialized with
     // padding value).
     auto nextVec = generateInBoundsCheck(
         b, xferOp, iv, dim, TypeRange(xferOp.getVectorType()),
         /*inBoundsCase=*/
         [&](OpBuilder &b, Location loc) {
           Value val = b.create<memref::LoadOp>(loc, xferOp.source(), indices);
           return b.create<vector::InsertElementOp>(loc, val, vec, iv);
         },
         /*outOfBoundsCase=*/
         [&](OpBuilder & /*b*/, Location loc) { return vec; });
     b.create<scf::YieldOp>(loc, nextVec);
   }

   static Value initialLoopState(OpBuilder &b, TransferReadOp xferOp) {
     // Inititalize vector with padding value.
     Location loc = xferOp.getLoc();
     return b.create<SplatOp>(loc, xferOp.getVectorType(), xferOp.padding());
   }
 };

 /// Codegen strategy for TransferWriteOp.
 template <>
 struct Strategy1d<TransferWriteOp> {
   static void generateForLoopBody(OpBuilder &b, Location loc,
                                   TransferWriteOp xferOp, Value iv,
                                   ValueRange /*loopState*/) {
     SmallVector<Value, 8> indices;
     auto dim = get1dMemrefIndices(b, xferOp, iv, indices);

     // Nothing to do in case of out-of-bounds access.
     generateInBoundsCheck(
         b, xferOp, iv, dim,
         /*inBoundsCase=*/[&](OpBuilder &b, Location loc) {
           auto val =
               b.create<vector::ExtractElementOp>(loc, xferOp.vector(), iv);
           b.create<memref::StoreOp>(loc, val, xferOp.source(), indices);
         });
     b.create<scf::YieldOp>(loc);
   }

   static Value initialLoopState(OpBuilder &b, TransferWriteOp xferOp) {
     return Value();
   }
 };

 /// Return true if the last dimension of the MemRefType has unit stride.
 static bool isLastMemrefDimUnitStride(MemRefType type) {
   int64_t offset;
   SmallVector<int64_t, 4> strides;
   auto successStrides = getStridesAndOffset(type, strides, offset);
   return succeeded(successStrides) && (strides.empty() || strides.back() == 1);
 }

 /// Lower a 1D vector transfer op to SCF using scalar loads/stores. This is
 /// necessary in cases where a 1D vector transfer op cannot be lowered into
 /// vector load/stores due to non-unit strides or broadcasts:
 ///
 /// * Transfer dimension is not the last memref dimension
 /// * Transfer dimension is a broadcast (i.e., scalar load + broadcast)
 /// * Memref has a layout map with non-unit stride on the last dimension
 ///
 /// This pattern generates IR as follows:
 ///
 /// 1. Generate a for loop iterating over each vector element.
 /// 2. Inside the loop, generate a InsertElementOp or ExtractElementOp,
 ///    depending on OpTy.
 ///
 /// TODO: In some cases (no masking, etc.), LLVM::MatrixColumnMajorLoadOp
 ///       can be generated instead of TransferOp1dConversion. Add such a pattern
 ///       to ConvertVectorToLLVM.
 ///
 /// E.g.:
 /// ```
 /// vector.transfer_write %vec, %A[%a, %b]
 ///    {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true]}
 ///    : vector<9xf32>, memref<?x?xf32>
 /// ```
 /// Is rewritten to approximately the following pseudo-IR:
 /// ```
 /// for i = 0 to 9 {
 ///   %t = vector.extractelement %vec[i] : vector<9xf32>
 ///   memref.store %t, %arg0[%a + i, %b] : memref<?x?xf32>
 /// }
 /// ```
 template <typename OpTy>
 struct TransferOp1dConversion : public VectorToSCFPattern<OpTy> {
   using VectorToSCFPattern<OpTy>::VectorToSCFPattern;

   LogicalResult matchAndRewrite(OpTy xferOp,
                                 PatternRewriter &rewriter) const override {
     // TODO: support 0-d corner case.
     if (xferOp.getTransferRank() == 0)
       return failure();
     auto map = xferOp.permutation_map();
     auto memRefType = xferOp.getShapedType().template dyn_cast<MemRefType>();

     if (!memRefType)
       return failure();
     if (xferOp.getVectorType().getRank() != 1)
       return failure();
     if (map.isMinorIdentity() && isLastMemrefDimUnitStride(memRefType))
       return failure(); // Handled by ConvertVectorToLLVM

     // Loop bounds, step, state...
     Location loc = xferOp.getLoc();
     auto vecType = xferOp.getVectorType();
     auto lb = rewriter.create<arith::ConstantIndexOp>(loc, 0);
     auto ub =
         rewriter.create<arith::ConstantIndexOp>(loc, vecType.getDimSize(0));
     auto step = rewriter.create<arith::ConstantIndexOp>(loc, 1);
     auto loopState = Strategy1d<OpTy>::initialLoopState(rewriter, xferOp);

     // Generate for loop.
     rewriter.replaceOpWithNewOp<scf::ForOp>(
         xferOp, lb, ub, step, loopState ? ValueRange(loopState) : ValueRange(),
         [&](OpBuilder &b, Location loc, Value iv, ValueRange loopState) {
           Strategy1d<OpTy>::generateForLoopBody(b, loc, xferOp, iv, loopState);
         });

     return success();
   }
 };

 } // namespace lowering_1_d
 } // namespace

 namespace mlir {

 void populateVectorToSCFConversionPatterns(
     RewritePatternSet &patterns, const VectorTransferToSCFOptions &options) {
   if (options.unroll) {
     patterns.add<lowering_n_d_unrolled::UnrollTransferReadConversion,
                  lowering_n_d_unrolled::UnrollTransferWriteConversion>(
         patterns.getContext(), options);
   } else {
     patterns.add<lowering_n_d::PrepareTransferReadConversion,
                  lowering_n_d::PrepareTransferWriteConversion,
                  lowering_n_d::TransferOpConversion<TransferReadOp>,
                  lowering_n_d::TransferOpConversion<TransferWriteOp>>(
         patterns.getContext(), options);
   }

   if (options.targetRank == 1) {
     patterns.add<lowering_1_d::TransferOp1dConversion<TransferReadOp>,
                  lowering_1_d::TransferOp1dConversion<TransferWriteOp>>(
         patterns.getContext(), options);
   }
 }

 } // namespace mlir

 namespace {

 struct ConvertVectorToSCFPass
     : public ConvertVectorToSCFBase<ConvertVectorToSCFPass> {
   ConvertVectorToSCFPass() = default;
   ConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
     this->fullUnroll = options.unroll;
     this->targetRank = options.targetRank;
     this->lowerPermutationMaps = options.lowerPermutationMaps;
     this->lowerTensors = options.lowerTensors;
   }

   void runOnFunction() override {
     VectorTransferToSCFOptions options;
     options.unroll = fullUnroll;
     options.targetRank = targetRank;
     options.lowerPermutationMaps = lowerPermutationMaps;
     options.lowerTensors = lowerTensors;

     // Lower permutation maps first.
     if (lowerPermutationMaps) {
       RewritePatternSet lowerTransferPatterns(getFunction().getContext());
       mlir::vector::populateVectorTransferPermutationMapLoweringPatterns(
           lowerTransferPatterns);
       (void)applyPatternsAndFoldGreedily(getFunction(),
                                          std::move(lowerTransferPatterns));
     }

     RewritePatternSet patterns(getFunction().getContext());
     populateVectorToSCFConversionPatterns(patterns, options);
     (void)applyPatternsAndFoldGreedily(getFunction(), std::move(patterns));
   }
 };

 } // namespace

 std::unique_ptr<Pass>
 mlir::createConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
   return std::make_unique<ConvertVectorToSCFPass>(options);
 }