//===- XeGPUSubgroupDistribute.cpp - XeGPU Subgroup Distribute Pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeRange.h"
#include "mlir/IR/Value.h"
#include "mlir/IR/Visitors.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/InliningUtils.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
namespace mlir {
namespace xegpu {
#define GEN_PASS_DEF_XEGPUSUBGROUPDISTRIBUTE
#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
} // namespace xegpu
} // namespace mlir
#define DEBUG_TYPE "xegpu-subgroup-distribute"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
using namespace mlir;
static const char *const resolveSIMTTypeMismatch =
"resolve_simt_type_mismatch"; // Attribute name for identifying
                                  // UnrealizedConversionCastOp added to
                                  // resolve SIMT type mismatches.
namespace {
//===----------------------------------------------------------------------===//
// SIMT Distribution Patterns
//===----------------------------------------------------------------------===//
/// In certain cases, we may need to favor XeGPU specific distribution patterns
/// over generic vector distribution patterns. In such cases, we can assign
/// priorities to patterns.
static constexpr unsigned regularPatternBenefit = 1;
static constexpr unsigned highPatternBenefit = 2;
/// Helper function to get distributed vector type for a source vector type
/// according to the lane_layout. We simply divide each dimension of the
/// original vector shape by the corresponding lane_layout dimension. If
/// array_length > 1, it is appended to the front of the distributed shape.
/// NOTE: This is the vector type that will be returned by the
/// gpu.warp_execute_on_lane0 op.
///
/// Examples:
/// | original vector shape | lane_layout | distributed vector shape |
/// |-----------------------|-------------|--------------------------|
/// | 32x16 | [1, 16] | 32x1 |
/// | 32x16 | [2, 8] | 16x2 |
/// | 2x32x16 | [1, 16] | 2x32x1 |
static FailureOr<VectorType>
getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
VectorType originalType) {
if (!layout)
return failure();
assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
"Expecting a valid layout.");
SmallVector<int64_t> effectiveLaneLayout =
layout.getEffectiveLaneLayoutAsInt();
assert(static_cast<size_t>(originalType.getRank()) >=
effectiveLaneLayout.size() &&
"Rank of the original vector type should be greater or equal to the "
"size of the lane layout to distribute the vector type.");
SmallVector<int64_t> distributedShape(originalType.getShape());
// Only distribute the last `laneLayout.size()` dimensions. The remaining
// dimensions are not distributed.
unsigned distributionStart =
originalType.getRank() - effectiveLaneLayout.size();
for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
if (i < distributionStart)
continue;
// Check if the dimension can be distributed evenly.
if (dim % effectiveLaneLayout[i - distributionStart] != 0)
return failure();
distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
}
return VectorType::get(distributedShape, originalType.getElementType());
}
/// Helper function to resolve types if the distributed type out of
/// gpu.warp_execute_on_lane0 is different from the expected xegpu SIMT type.
/// Example 1:
/// distributed type: vector<8x1xf32>
/// expected type: vector<8xf32>
/// resolved using,
/// %0 = vector.shape_cast %1 : vector<8x1xf32> to vector<8xf32>
/// Example 2:
/// distributed type: xegpu.tensor_desc<8x16xf32, #xegpu.layout<...>>
/// expected type: xegpu.tensor_desc<8x16xf32>
/// resolved using,
/// %0 = unrealized_conversion_cast %1 :
/// xegpu.tensor_desc<8x16xf32, #xegpu.layout<..>> ->
/// xegpu.tensor_desc<8x16xf32>
template <typename T>
static Value resolveDistributedTy(Value orig, T expected,
PatternRewriter &rewriter) {
// If orig and expected types are the same, return orig.
if (orig.getType() == expected)
return orig;
// If orig is a vector type, create a shape cast op to reconcile the types.
if (isa<VectorType>(orig.getType())) {
auto castOp =
vector::ShapeCastOp::create(rewriter, orig.getLoc(), expected, orig);
return castOp.getResult();
}
// If orig is a tensor descriptor type, create an unrealized conversion cast
// op to reconcile the types.
if (isa<xegpu::TensorDescType>(orig.getType())) {
auto castOp = UnrealizedConversionCastOp::create(rewriter, orig.getLoc(),
expected, orig);
castOp->setAttr(resolveSIMTTypeMismatch, rewriter.getUnitAttr());
return castOp.getResult(0);
}
llvm_unreachable("Unsupported type for reconciliation");
return orig;
}
/// Helper function to check if the layout is packed. Layout is packed if it is
/// 2D and lane_data[0] != 1 (data packed from col dimension).
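/// For example, a layout with lane_data = [2, 1] is packed, while a layout
/// with lane_data = [1, 1] is not.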
static bool hasPackedLayout(xegpu::LayoutAttr layout) {
if (layout == xegpu::LayoutAttr())
return false;
DenseI32ArrayAttr laneData = layout.getLaneData();
if (!laneData || laneData.size() != 2)
return false;
return laneData.asArrayRef()[0] != 1;
}
/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
/// of the original GPUFuncOp to the new GPUFuncOp such that the entire body is
/// contained within a WarpExecuteOnLane0Op.
/// Example:
///
/// ```
/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
/// ...
/// ...
/// gpu.return %result: vector<8x16xf32>
/// }
/// ```
/// To
/// ```
/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
/// %laneid = gpu.lane_id : index
/// %0 = gpu.warp_execute_on_lane_0(%laneid) -> vector<8x16xf32> {
/// ...
/// ...
/// gpu.yield %result: vector<8x16xf32>
/// }
/// gpu.return %0
/// }
/// ```
struct MoveFuncBodyToWarpExecuteOnLane0
: public OpRewritePattern<gpu::GPUFuncOp> {
using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
PatternRewriter &rewriter) const override {
// If the function only contains a single void return, skip.
if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
}))
return failure();
// If the function already moved inside a warp_execute_on_lane0, skip.
if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
return isa<gpu::WarpExecuteOnLane0Op>(op);
}))
return failure();
// Create a new function with the same signature and same attributes.
SmallVector<Type> workgroupAttributionsTypes =
llvm::map_to_vector(gpuFuncOp.getWorkgroupAttributions(),
[](BlockArgument arg) { return arg.getType(); });
SmallVector<Type> privateAttributionsTypes =
llvm::map_to_vector(gpuFuncOp.getPrivateAttributions(),
[](BlockArgument arg) { return arg.getType(); });
auto newGpuFunc = gpu::GPUFuncOp::create(
rewriter, gpuFuncOp.getLoc(), gpuFuncOp.getName(),
gpuFuncOp.getFunctionType(), workgroupAttributionsTypes,
privateAttributionsTypes);
newGpuFunc->setAttrs(gpuFuncOp->getAttrs());
// Create a WarpExecuteOnLane0Op with same arguments and results as the
// original gpuFuncOp.
rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
auto laneId = gpu::LaneIdOp::create(
rewriter, newGpuFunc.getLoc(), rewriter.getIndexType(),
        /*upperBound=*/mlir::IntegerAttr());
ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
auto warpOp = gpu::WarpExecuteOnLane0Op::create(
rewriter, laneId.getLoc(), gpuFuncResultType, laneId,
xegpu::targetinfo::subgroupSize, newGpuFunc.getArguments(),
newGpuFunc.getArgumentTypes());
Block &warpBodyBlock = warpOp.getBodyRegion().front();
// Replace the ReturnOp of the original gpu function with a YieldOp.
    auto origReturnOp =
        cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
    rewriter.setInsertionPointAfter(origReturnOp);
    gpu::YieldOp::create(rewriter, origReturnOp.getLoc(),
                         origReturnOp.getOperands());
    rewriter.eraseOp(origReturnOp);
// Move the original function body to the WarpExecuteOnLane0Op body.
rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
warpOp.getBodyRegion().begin());
rewriter.eraseBlock(&warpBodyBlock);
// Insert a new ReturnOp after the WarpExecuteOnLane0Op.
rewriter.setInsertionPointAfter(warpOp);
gpu::ReturnOp::create(rewriter, newGpuFunc.getLoc(), warpOp.getResults());
rewriter.replaceOp(gpuFuncOp, newGpuFunc);
return success();
}
};
/// Distribute a create_nd_tdesc feeding into the gpu.yield op of the enclosing
/// `gpu.warp_execute_on_lane_0` region. After the sinking, the warp op will
/// still contain the original op that will not be used by the yield op (and
/// should be cleaned up later). The yield op will bypass the create_nd_tdesc's
/// arguments. Tensor descriptor shape is not distributed because it is a
/// uniform value across all work items within the subgroup. However, the
/// layout information is dropped in the new tensor descriptor type.
///
/// Example:
///
/// ```
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
/// (!xegpu.tensor_desc<4x8xf32, #layout0>) {
/// ...
/// %td = xegpu.create_nd_tdesc %arg0[0, 0]
/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
/// gpu.yield %td
/// }
/// ```
/// To
/// ```
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) {
/// ...
/// %dead = xegpu.create_nd_tdesc %arg0[0, 0]
/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
/// gpu.yield %arg0, %dead
/// }
/// %td = xegpu.create_nd_tdesc %r#0[0, 0]: memref<4x8xf32>
/// -> !xegpu.tensor_desc<4x8xf32>
///
/// ```
struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
OpOperand *operand =
getWarpResult(warpOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
if (!operand)
return rewriter.notifyMatchFailure(
warpOp, "warp result is not a xegpu::CreateNdDesc op");
auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
unsigned operandIdx = operand->getOperandNumber();
xegpu::LayoutAttr layout = descOp.getType().getLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
descOp, "the tensor descriptor lacks layout attribute");
SmallVector<size_t> newRetIndices;
rewriter.setInsertionPoint(warpOp);
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, /* new yielded values = */ descOp->getOperands(),
/* new yielded types = */ descOp.getOperandTypes(), newRetIndices);
SmallVector<Value> newDescOperands = llvm::map_to_vector(
newRetIndices, [&](size_t i) { return newWarpOp.getResult(i); });
rewriter.setInsertionPointAfter(newWarpOp);
xegpu::TensorDescType distributedTensorDescTy =
descOp.getType().dropLayouts(); // Distributed tensor descriptor type
// does not contain layout info.
Value newDescOp = xegpu::CreateNdDescOp::create(
rewriter, newWarpOp.getLoc(), distributedTensorDescTy, newDescOperands,
descOp->getAttrs());
Value distributedVal = newWarpOp.getResult(operandIdx);
// Resolve the distributed type to the expected type.
newDescOp =
resolveDistributedTy(newDescOp, distributedVal.getType(), rewriter);
rewriter.replaceAllUsesWith(distributedVal, newDescOp);
return success();
}
};
/// Distribute a store_nd op at the end of enclosing
/// `gpu.warp_execute_on_lane_0`. In case arguments for the store are passed
/// through the warp op interface, they are propagated as returned values.
/// The source vector is distributed based on the lane layout. Appropriate
/// cast ops are inserted if the distributed types do not match the expected
/// xegpu SIMT types.
///
/// Example:
///
/// ```
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// gpu.warp_execute_on_lane_0(%laneid) -> () {
/// ...
/// xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
/// !xegpu.tensor_desc<4x8xf32, #layout0>
/// }
/// ```
/// To
/// ```
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
/// !xegpu.tensor_desc<4x8xf32, #layout0>) {
/// gpu.yield %arg0, %arg1: vector<4x8xf32>, !xegpu.tensor_desc<4x8xf32,
/// #layout0>
/// }
/// %0 = vector.shape_cast %r#0: vector<4x1xf32> to vector<4xf32>
/// %1 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
/// #layout0>
/// -> !xegpu.tensor_desc<4x8xf32>
/// xegpu.store_nd %0, %1: vector<4xf32>,
/// !xegpu.tensor_desc<4x8xf32>
///
/// ```
struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
gpu::YieldOp yield = warpOp.getTerminator();
Operation *lastNode = yield->getPrevNode();
auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
if (!storeOp)
return failure();
int64_t offsetSize = static_cast<int64_t>(storeOp.getOffsets().size());
if ((offsetSize != 0) || storeOp.getConstOffsetsAttr())
return failure();
xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType();
xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
storeOp, "the source tensor descriptor lacks layout attribute");
FailureOr<VectorType> distributedTypeByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
if (failed(distributedTypeByWarpOpOrFailure))
return rewriter.notifyMatchFailure(storeOp,
"Failed to distribute the type");
VectorType distributedTypeByWarpOp =
distributedTypeByWarpOpOrFailure.value();
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp,
/* new yielded values = */
ValueRange{storeOp.getValue(), storeOp.getTensorDesc()},
/* new yielded types = */
TypeRange{distributedTypeByWarpOp, storeOp.getTensorDescType()},
newRetIndices);
// Create a new store op outside the warp op with the distributed vector
// type. Tensor descriptor is not distributed.
rewriter.setInsertionPointAfter(newWarpOp);
SmallVector<Value> newStoreOperands;
// For the value operand, there can be a mismatch between the vector type
// distributed by the warp op and (xegpu-specific) distributed type
// supported by the store op. Type mismatch must be resolved using
// appropriate cast op.
FailureOr<VectorType> storeNdDistributedValueTyOrFailure =
xegpu::getDistributedVectorType(storeOp.getTensorDescType());
if (failed(storeNdDistributedValueTyOrFailure))
return rewriter.notifyMatchFailure(
storeOp, "Failed to get distributed vector type for the store op");
newStoreOperands.push_back(resolveDistributedTy(
newWarpOp.getResult(newRetIndices[0]),
storeNdDistributedValueTyOrFailure.value(), rewriter));
// For the tensor descriptor operand, the layout attribute is dropped after
    // distribution. Types need to be resolved in this case as well.
xegpu::TensorDescType distributedTensorDescTy =
storeOp.getTensorDescType().dropLayouts();
newStoreOperands.push_back(
resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
distributedTensorDescTy, rewriter));
auto newStoreOp =
xegpu::StoreNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
newStoreOperands, storeOp->getAttrs());
xegpu::removeLayoutAttrs(newStoreOp);
rewriter.eraseOp(storeOp);
return success();
}
};
/// Distribute a load_nd op feeding into the gpu.yield op of the enclosing
/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
/// The warp op will still contain the original op that will not be used by
/// the yield op (and should be cleaned up later). The yield op will
/// bypass the load's arguments. Only the loaded vector is distributed
/// according to the lane layout; the tensor descriptor type is not
/// distributed. Appropriate cast ops are inserted if the distributed types
/// do not match the expected xegpu SIMT types.
///
/// Example:
///
/// ```
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
/// (vector<4x1xf32>) {
/// ...
/// %ld = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32, #layout0>
/// -> vector<4x8xf32>
/// gpu.yield %ld
/// }
/// ```
/// To
/// ```
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
/// !xegpu.tensor_desc<4x8xf32, #layout0>) {
/// ...
/// %dead = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32, #layout0>
/// -> vector<4x8xf32>
/// gpu.yield %dead, %arg0
/// }
/// %0 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
/// #layout0> -> !xegpu.tensor_desc<4x8xf32>
/// %1 = xegpu.load_nd %0: !xegpu.tensor_desc<4x8xf32> -> vector<4xf32>
/// %2 = vector.shape_cast %1: vector<4xf32> to vector<4x1xf32>
///
/// ```
struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) {
if (!isa<xegpu::LoadNdOp>(op))
return false;
// Make sure the same load op is the last operation in the warp op body.
      // This ensures that the load op is not sunk earlier, violating any
      // barrier synchronization.
gpu::YieldOp yield = warpOp.getTerminator();
return yield->getPrevNode() == op;
});
if (!operand)
return rewriter.notifyMatchFailure(
warpOp, "warp result is not a xegpu::LoadNd op");
auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
int64_t offsetSize = static_cast<int64_t>(loadOp.getOffsets().size());
if ((offsetSize != 0) || loadOp.getConstOffsetsAttr())
return failure();
xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
loadOp, "the source tensor descriptor lacks layout attribute");
unsigned operandIdx = operand->getOperandNumber();
VectorType distributedTypeByWarpOp =
cast<VectorType>(warpOp.getResult(operandIdx).getType());
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp,
/* new yielded values = */ loadOp.getTensorDesc(),
/* new yielded types = */ tensorDescTy, newRetIndices);
// Create a new load op outside the warp op with the distributed vector
// type.
rewriter.setInsertionPointAfter(newWarpOp);
FailureOr<VectorType> loadNdDistValueTyOrFailure =
xegpu::getDistributedVectorType(loadOp.getTensorDescType());
if (failed(loadNdDistValueTyOrFailure))
return rewriter.notifyMatchFailure(
loadOp, "Failed to get distributed vector type for the load op");
xegpu::TensorDescType distributedTensorDescTy =
loadOp.getTensorDescType().dropLayouts(); // Distributed tensor
// descriptor type does not
// contain layout info.
auto newLoadOp = xegpu::LoadNdOp::create(
rewriter, newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
resolveDistributedTy(newWarpOp->getResult(newRetIndices[0]),
distributedTensorDescTy, rewriter),
loadOp->getAttrs());
xegpu::removeLayoutAttrs(newLoadOp);
// Set the packed attribute if the layout requires it.
newLoadOp.setPacked(hasPackedLayout(layout));
Value distributedVal = newWarpOp.getResult(operandIdx);
// There can be a conflict between the vector type distributed by the
// warp op and (xegpu-specific) distributed type supported by the load
// op. Resolve these mismatches by inserting a cast.
Value tyResolvedVal = resolveDistributedTy(
newLoadOp.getResult(), distributedTypeByWarpOp, rewriter);
rewriter.replaceAllUsesWith(distributedVal, tyResolvedVal);
return success();
}
};
/// Distribute a dpas op feeding into the gpu.yield op of the enclosing
/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
/// The warp op will still contain the original op that will not be used by
/// the yield op (and should be cleaned up later). The yield op will
/// bypass the dpas's arguments. Appropriate cast ops are inserted if the
/// distributed types do not match the expected xegpu SIMT types.
/// Example:
/// ```
/// #lo_a = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
/// #lo_b = #xegpu.layout<wi_layout = [1, 16], wi_data = [2, 1]>
/// #lo_c = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
/// (vector<8x1xf32>) {
/// ...
/// %dpas = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16> ->
/// vector<8x16xf32>
/// gpu.yield %dpas
/// }
/// ```
/// To
/// ```
/// %r:3 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<8x1xf32>,
/// vector<8x1xf16>, vector<16x1xf16>) {
/// ...
/// %dead = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16>
/// -> vector<8x16xf32>
/// gpu.yield %dead, %arg0, %arg1
/// }
/// %0 = vector.shape_cast %r#1: vector<8x1xf16> to vector<8xf16>
/// %1 = vector.shape_cast %r#2: vector<16x1xf16> to vector<16xf16>
/// %2 = xegpu.dpas %0, %1: vector<8xf16>, vector<16xf16> ->
/// vector<8xf32>
/// %dpas = vector.shape_cast %2: vector<8xf32> to vector<8x1xf32>
/// ```
struct DpasDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<xegpu::DpasOp>);
if (!operand)
return rewriter.notifyMatchFailure(warpOp,
"warp result is not a xegpu::Dpas op");
auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
unsigned operandIdx = operand->getOperandNumber();
std::string layoutAName = xegpu::getLayoutName(dpasOp->getOpOperand(0));
std::string layoutBName = xegpu::getLayoutName(dpasOp->getOpOperand(1));
std::string layoutCName = xegpu::getLayoutName(dpasOp->getOpResult(0));
xegpu::LayoutAttr layoutA =
dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutAName);
xegpu::LayoutAttr layoutB =
dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutBName);
xegpu::LayoutAttr layoutOut =
dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutCName);
if (!layoutA || !layoutB || !layoutOut)
return rewriter.notifyMatchFailure(
dpasOp,
"the xegpu::Dpas op lacks layout attribute for A, B or output");
FailureOr<VectorType> distLhsTypeByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutA, dpasOp.getLhsType());
FailureOr<VectorType> distRhsTypeByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType());
FailureOr<VectorType> distResultTypeByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType());
if (failed(distLhsTypeByWarpOpOrFailure) ||
failed(distRhsTypeByWarpOpOrFailure) ||
failed(distResultTypeByWarpOpOrFailure))
return rewriter.notifyMatchFailure(
dpasOp,
"Failed to distribute the A, B or output types in xegpu::Dpas op");
llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(),
dpasOp.getRhs()};
llvm::SmallVector<Type, 3> newYieldTypes{
distLhsTypeByWarpOpOrFailure.value(),
distRhsTypeByWarpOpOrFailure.value()};
// Dpas acc operand is optional.
if (dpasOp.getAcc()) {
newYieldValues.push_back(dpasOp.getAcc());
newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
}
// Create a new warp op without the dpas.
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
FailureOr<VectorType> expectedDistLhsTyOrFailure =
xegpu::getDistributedVectorType(dpasOp.getLhsType(), layoutA);
FailureOr<VectorType> expectedDistRhsTyOrFailure =
xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB);
FailureOr<VectorType> expectedDistResultTyOrFailure =
xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut);
if (failed(expectedDistLhsTyOrFailure) ||
failed(expectedDistRhsTyOrFailure) ||
failed(expectedDistResultTyOrFailure))
return rewriter.notifyMatchFailure(
dpasOp,
"Failed to get distributed vector type for the dpas operands.");
// Create a new dpas op outside the warp op.
rewriter.setInsertionPointAfter(newWarpOp);
SmallVector<Value> newDpasOperands;
SmallVector<VectorType> newDpasOperandExpectedTypes;
// Resolve the distributed types with the original types.
newDpasOperandExpectedTypes.push_back(expectedDistLhsTyOrFailure.value());
newDpasOperandExpectedTypes.push_back(expectedDistRhsTyOrFailure.value());
VectorType distributedResultTy = expectedDistResultTyOrFailure.value();
if (dpasOp.getAcc())
newDpasOperandExpectedTypes.push_back(distributedResultTy);
for (unsigned i = 0; i < newRetIndices.size(); i++) {
newDpasOperands.push_back(
resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]),
newDpasOperandExpectedTypes[i], rewriter));
}
auto newDpasOp = xegpu::DpasOp::create(rewriter, newWarpOp->getLoc(),
distributedResultTy, newDpasOperands,
dpasOp->getAttrs());
xegpu::removeLayoutAttrs(newDpasOp);
Value distributedVal = newWarpOp.getResult(operandIdx);
// Resolve the output type.
Value typeResolved =
resolveDistributedTy(newDpasOp.getResult(),
distResultTypeByWarpOpOrFailure.value(), rewriter);
rewriter.replaceAllUsesWith(distributedVal, typeResolved);
return success();
}
};
/// Sink an update_nd_offset op feeding into yield op of an enclosing
/// `gpu.warp_execute_on_lane_0` region. The warp op will still contain the
/// original op that will not be used by the yield op (and should be cleaned
/// up later). The yield op will bypass the updateOp's arguments. The tensor
/// descriptor type is not distributed. Appropriate cast ops are inserted if
/// the distributed types do not match the expected xegpu SIMT types.
/// Example:
/// ```
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
/// (!xegpu.tensor_desc<4x8xf32, #layout0>) {
/// ...
/// %update = xegpu.update_nd_offset %arg0, [%c32, %c16]:
/// !xegpu.tensor_desc<4x8xf32, #layout0>
/// gpu.yield %update
/// }
/// ...
/// ```
/// To
/// ```
/// %r:4 = gpu.warp_execute_on_lane_0(%laneid) -> (
/// !xegpu.tensor_desc<4x8xf32, #layout0>,
/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
/// ...
/// %dead = xegpu.update_nd_offset %arg0, [%c32, %c16]:
/// !xegpu.tensor_desc<4x8xf32, #layout0>
/// gpu.yield %dead, %arg0, %c32, %c16
/// }
/// %0 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
/// #layout0> -> !xegpu.tensor_desc<4x8xf32>
/// %1 = xegpu.update_nd_offset %0, [%r#2, %r#3]:
/// !xegpu.tensor_desc<4x8xf32>
/// ...
/// ```
struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
OpOperand *operand =
getWarpResult(warpOp, llvm::IsaPred<xegpu::UpdateNdOffsetOp>);
if (!operand)
return rewriter.notifyMatchFailure(
warpOp, "warp result is not a xegpu::UpdateNdOffset op");
auto updateOp = operand->get().getDefiningOp<xegpu::UpdateNdOffsetOp>();
unsigned operandIdx = operand->getOperandNumber();
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, updateOp->getOperands(), updateOp.getOperandTypes(),
newRetIndices);
rewriter.setInsertionPointAfter(newWarpOp);
    // The new update op does not have a layout attribute.
xegpu::TensorDescType distributedTensorDescTy =
updateOp.getTensorDescType().dropLayouts();
SmallVector<Value> newUpdateOperands =
llvm::map_to_vector(newRetIndices, [&](size_t i) {
// For the tensor descriptor operand, the layout attribute is
          // dropped after distribution. Types need to be resolved in this
// case.
if (isa<xegpu::TensorDescType>(newWarpOp.getResult(i).getType())) {
return resolveDistributedTy(newWarpOp.getResult(i),
distributedTensorDescTy, rewriter);
}
return newWarpOp.getResult(i);
});
// Create a new update op outside the warp op.
auto newUpdateOp = xegpu::UpdateNdOffsetOp::create(
rewriter, newWarpOp.getLoc(), distributedTensorDescTy,
newUpdateOperands, updateOp->getAttrs());
xegpu::removeLayoutAttrs(newUpdateOp);
Value distributedVal = newWarpOp.getResult(operandIdx);
// Resolve the distributed type with the original type.
Value typeResolved = resolveDistributedTy(
newUpdateOp.getResult(), distributedVal.getType(), rewriter);
rewriter.replaceAllUsesWith(distributedVal, typeResolved);
return success();
}
};
/// Distribute a prefetch_nd op at the end of enclosing
/// `gpu.warp_execute_on_lane_0`. In case arguments for the prefetch are
/// passed through the warp op interface, they are propagated as returned
/// values. Tensor descriptor shape is not distributed because it is a
/// uniform value across all work items within the subgroup. Appropriate
/// cast ops are inserted if the distributed types do not match the expected
/// xegpu SIMT types.
///
/// Example:
///
/// ```
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// gpu.warp_execute_on_lane_0(%laneid) -> () {
/// ...
/// xegpu.prefetch_nd %arg0 : !xegpu.tensor_desc<4x8xf32, #layout0>
/// }
/// ```
/// To
/// ```
/// %r:1 = gpu.warp_execute_on_lane_0(%laneid) -> (
/// !xegpu.tensor_desc<4x8xf32, #layout0>) {
/// gpu.yield %arg0: !xegpu.tensor_desc<4x8xf32, #layout0>
/// }
/// %1 = unrealized_conversion_cast %r#0: !xegpu.tensor_desc<4x8xf32,
/// #layout0> -> !xegpu.tensor_desc<4x8xf32>
/// xegpu.prefetch_nd %1 : !xegpu.tensor_desc<4x8xf32>
///
/// ```
struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
gpu::YieldOp yield = warpOp.getTerminator();
Operation *lastNode = yield->getPrevNode();
auto prefetchOp = dyn_cast_or_null<xegpu::PrefetchNdOp>(lastNode);
if (!prefetchOp)
return failure();
int64_t offsetSize = static_cast<int64_t>(prefetchOp.getOffsets().size());
if ((offsetSize != 0) || prefetchOp.getConstOffsetsAttr())
return failure();
xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
prefetchOp, "the source tensor descriptor lacks layout attribute");
SmallVector<Value, 1> newYieldValues = {prefetchOp.getTensorDesc()};
SmallVector<Type, 1> newYieldTypes = {prefetchOp.getTensorDescType()};
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
// Create a new prefetch op outside the warp op with updated tensor
    // descriptor type. The source tensor descriptor requires type resolution.
xegpu::TensorDescType newTensorDescTy =
prefetchOp.getTensorDescType().dropLayouts();
rewriter.setInsertionPointAfter(newWarpOp);
SmallVector<Value> newPrefetchOperands = {resolveDistributedTy(
newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)};
xegpu::PrefetchNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
newPrefetchOperands, prefetchOp->getAttrs());
xegpu::removeLayoutAttrs(prefetchOp);
rewriter.eraseOp(prefetchOp);
return success();
}
};
/// Sink a gpu::BarrierOp at the end of enclosing `gpu.warp_execute_on_lane_0`
/// region. This will simply move the barrier op outside of the warp op.
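/// Example:
/// ```
/// gpu.warp_execute_on_lane_0(%laneid) -> () {
/// ...
/// gpu.barrier
/// }
/// ```
/// To
/// ```
/// gpu.warp_execute_on_lane_0(%laneid) -> () {
/// ...
/// }
/// gpu.barrier
/// ```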
struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
gpu::YieldOp yield = warpOp.getTerminator();
Operation *lastNode = yield->getPrevNode();
// The last node must be a gpu::BarrierOp.
auto barrierOp = dyn_cast_or_null<gpu::BarrierOp>(lastNode);
if (!barrierOp)
return failure();
// Move the barrier op outside of the warp op.
rewriter.setInsertionPointAfter(warpOp);
gpu::BarrierOp::create(rewriter, barrierOp.getLoc(),
barrierOp->getResultTypes(),
barrierOp->getOperands(), barrierOp->getAttrs());
rewriter.eraseOp(barrierOp);
return success();
}
};
/// Distribute a scattered store op. The offsets argument is required.
/// Both offset and mask vectors must be 1D and have #subgroup_size elements.
/// The layouts are fixed and implicit: one offset/mask per lane.
/// The pass changes the offset/mask vector shapes to a
/// single-element vector; **it is assumed that their producer will also be
/// distributed**. The payload vector also has a fixed distribution:
/// no chunk size -> vector of one element.
/// chunk size -> vector of the innermost dimension of the SG-payload.
/// Example 1 (no chunk size):
/// %mask = producer_op : vector<16xi1>
/// %offset = producer_op : vector<16xindex>
/// xegpu.store %payload, %src[%offset], %mask : vector<16xf16>,
/// memref<256xf16>, vector<16xindex>, vector<16xi1>
/// To
/// %mask = producer_op : vector<1xi1>
/// %offset = producer_op : vector<1xindex>
/// xegpu.store %payload, %src[%offset], %mask : vector<1xf16>,
/// memref<256xf16>, vector<1xindex>, vector<1xi1>
/// Example 2 (chunk size, same mask and offsets):
/// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
/// vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
/// To
/// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
/// vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
struct StoreDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
Operation *lastNode = warpOp.getTerminator()->getPrevNode();
auto storeScatterOp = dyn_cast_or_null<xegpu::StoreScatterOp>(lastNode);
if (!storeScatterOp)
return failure();
auto offsets = storeScatterOp.getOffsets();
if (!offsets || !isa<VectorType>(offsets.getType()))
return rewriter.notifyMatchFailure(
storeScatterOp, "Store op must have a vector of offsets argument");
VectorType offsetsTy = cast<VectorType>(offsets.getType());
VectorType maskTy = cast<VectorType>(storeScatterOp.getMask().getType());
if (offsetsTy.getRank() != 1 || maskTy.getRank() != 1)
return rewriter.notifyMatchFailure(storeScatterOp,
"Expected 1D offsets and mask vector");
VectorType storeVecTy = cast<VectorType>(storeScatterOp.getValueType());
if (storeVecTy.getRank() > 2)
return rewriter.notifyMatchFailure(
storeScatterOp, "Expected at most 2D result at SG level");
std::string layoutPayloadName =
xegpu::getLayoutName(storeScatterOp->getOpOperand(0));
std::string layoutOffsetsName =
xegpu::getLayoutName(storeScatterOp->getOpOperand(2));
std::string layoutMaskName =
xegpu::getLayoutName(storeScatterOp->getOpOperand(3));
xegpu::LayoutAttr layoutPayload =
storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutPayloadName);
xegpu::LayoutAttr layoutOffsets =
storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutOffsetsName);
xegpu::LayoutAttr layoutMask =
storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutMaskName);
FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
FailureOr<VectorType> distMaskByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
if (failed(distStoreVecByWarpOpOrFailure) ||
failed(distOffsetsByWarpOpOrFailure) ||
failed(distMaskByWarpOpOrFailure)) {
return rewriter.notifyMatchFailure(
storeScatterOp,
"Some vector operands have no layouts, using defaults instead.");
}
VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value();
VectorType expectedPayloadTy = VectorType::get(
{distPayloadTy.getNumElements()}, distPayloadTy.getElementType());
SmallVector<size_t> newRetIndices;
SmallVector<Value> operands = storeScatterOp->getOperands();
SmallVector<Type> operandTypesToYield = {
expectedPayloadTy, operands[1].getType(),
distOffsetsByWarpOpOrFailure.value(),
distMaskByWarpOpOrFailure.value()};
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
SmallVector<Value> newStoreScatterOpOperands = llvm::map_to_vector(
newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
rewriter.setInsertionPointAfter(newWarpOp);
xegpu::StoreScatterOp newOp = xegpu::StoreScatterOp::create(
rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
storeScatterOp->getAttrs());
xegpu::removeLayoutAttrs(newOp);
rewriter.eraseOp(storeScatterOp);
return success();
}
};
/// Distribute a scattered load op. The logic and requirements are the same as
/// for the scattered store distribution. The warpOp's payload vector is
/// expected to be distributed by the load's result consumer.
/// Example 1 (no chunk size):
/// %mask = producer_op : vector<16xi1>
/// %offset = producer_op : vector<16xindex>
/// %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
/// vector<16xindex>, vector<16xi1> -> vector<16xf16>
/// To
/// %mask = producer_op : vector<1xi1>
/// %offset = producer_op : vector<1xindex>
/// %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
/// vector<1xindex>, vector<1xi1> -> vector<1xf16>
/// Example 2 (chunk size, same mask and offsets):
/// %0 = xegpu.load %src[%offset], %mask <{chunk_size=8}> :
/// memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
/// To
/// %0 = xegpu.load %src[%offset], %mask <{chunk_size=8}> :
/// memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
struct LoadDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
      // Check whether the yield operand was produced by the *last* scattered
      // load op, to avoid sinking it before barriers (maintain memory order).
return isa<xegpu::LoadGatherOp>(op) &&
warpOp.getTerminator()->getPrevNode() == op;
});
if (!producedByLastLoad)
return rewriter.notifyMatchFailure(
warpOp, "The last op is not xegpu::LoadGatherOp");
auto loadGatherOp =
producedByLastLoad->get().getDefiningOp<xegpu::LoadGatherOp>();
auto offsets = loadGatherOp.getOffsets();
if (!offsets || !isa<VectorType>(offsets.getType()) ||
!isa<VectorType>(loadGatherOp.getMask().getType()))
return rewriter.notifyMatchFailure(
loadGatherOp,
"Load op must have a vector arguments for offsets and mask");
VectorType offsetsTy = cast<VectorType>(offsets.getType());
VectorType maskTy = cast<VectorType>(loadGatherOp.getMask().getType());
if (offsetsTy.getRank() != 1 || maskTy.getRank() != 1)
return rewriter.notifyMatchFailure(loadGatherOp,
"Expected 1D offsets and mask vector");
// Assume offset and mask producers will be distributed as well.
std::string layoutOffsetsName =
xegpu::getLayoutName(loadGatherOp->getOpOperand(1));
std::string layoutMaskName =
xegpu::getLayoutName(loadGatherOp->getOpOperand(2));
xegpu::LayoutAttr layoutOffsets =
loadGatherOp->getAttrOfType<xegpu::LayoutAttr>(layoutOffsetsName);
xegpu::LayoutAttr layoutMask =
loadGatherOp->getAttrOfType<xegpu::LayoutAttr>(layoutMaskName);
FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
FailureOr<VectorType> distMaskByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
if (failed(distOffsetsByWarpOpOrFailure) ||
failed(distMaskByWarpOpOrFailure)) {
return rewriter.notifyMatchFailure(
loadGatherOp,
"Some vector operands have no layouts, using defaults instead.");
}
SmallVector<size_t> newRetIndices;
SmallVector<Value> operands = loadGatherOp->getOperands();
SmallVector<Type> operandTypesToYield = {
operands[0].getType(), distOffsetsByWarpOpOrFailure.value(),
distMaskByWarpOpOrFailure.value()};
const unsigned operandIdx = producedByLastLoad->getOperandNumber();
VectorType loadVecTy =
cast<VectorType>(warpOp.getResult(operandIdx).getType());
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
SmallVector<Value> newLoadGatherOperands = llvm::map_to_vector(
newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
rewriter.setInsertionPointAfter(newWarpOp);
xegpu::LoadGatherOp newOp = xegpu::LoadGatherOp::create(
rewriter, newWarpOp.getLoc(), loadVecTy, newLoadGatherOperands,
loadGatherOp->getAttrs());
xegpu::removeLayoutAttrs(newOp);
Value distributedVal = newWarpOp.getResult(operandIdx);
rewriter.replaceAllUsesWith(distributedVal, newOp->getResult(0));
return success();
}
};
/// Helper to rewrite a 2D VectorMultiReductionOp into a sequence of 1D
/// VectorReductionOps.
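/// For example, reducing a vector<16x32xf32> source along dimension 1 with a
/// vector<16xf32> accumulator produces 16 slices of shape 1x32. Each slice is
/// shape-cast to vector<32xf32>, reduced to a scalar with vector.reduction
/// (seeded by the matching accumulator element), and inserted back into a
/// vector<16xf32> result.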
static Value lowerToVectorReductions(TypedValue<VectorType> src,
TypedValue<VectorType> acc,
vector::CombiningKind kind,
int64_t reductionDim, Location loc,
PatternRewriter &rewriter) {
// Expecting a 2D source vector.
assert(src.getType().getRank() == 2 && "expected a 2D source vector");
VectorType sourceType = src.getType();
int64_t sourceH = sourceType.getShape()[0];
int64_t sourceW = sourceType.getShape()[1];
int nSlices = (reductionDim == 0) ? sourceW : sourceH;
// Create a constant vector to hold the result of the reduction.
TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
Value reductionResult = arith::ConstantOp::create(
rewriter, loc, acc.getType(),
DenseElementsAttr::get(acc.getType(), zeroAttr));
// For each slice of the source, extract the slice vector, do a reduction
  // and insert the reduced value back into the result vector.
for (int i = 0; i < nSlices; ++i) {
SmallVector<int64_t, 2> sliceOffsets, sliceSizes;
if (reductionDim == 1) {
sliceOffsets = {i, 0};
sliceSizes = {1, sourceW};
} else {
sliceOffsets = {0, i};
sliceSizes = {sourceH, 1};
}
vector::ExtractStridedSliceOp extractOp =
vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
sliceSizes, {1, 1});
int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
Value slice = vector::ShapeCastOp::create(
rewriter, loc,
VectorType::get({nSliceElements}, sourceType.getElementType()),
extractOp.getResult());
Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
Value reduction =
vector::ReductionOp::create(rewriter, loc, kind, slice, accExtract);
reductionResult =
vector::InsertOp::create(rewriter, loc, reduction, reductionResult, i);
}
return reductionResult;
}
/// This pattern distributes the `vector.multi_reduction` operation across
/// lanes in a warp. Currently only 2D to 1D reductions are supported. Given
/// layouts for the source and accumulator vectors,
/// * If the reduction dimension is distributed across lanes, the reduction is
/// non-lane-local and the reduction is done using warp shuffles. Here we
/// simply rewrite the MultiDimReductionOp to a sequence of ReductionOps in
/// the warp op body.
/// * If the reduction dimension is not distributed across lanes, the reduction
/// is lane-local. In this case, we yield the source and accumulator vectors
/// from the warp op and perform the lane-local reduction outside the warp op
/// using a sequence of ReductionOps.
/// Example 1 (Reduction is lane-local):
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
/// %0 = "some_def"() : () -> (vector<16x32xf32>)
/// %acc = "some_def"() : () -> (vector<32xf32>)
/// %1 = vector.multi_reduction <add>, %0, %acc [0] : vector<16x32xf32> to
/// vector<32xf32>
/// gpu.yield %1 : vector<32xf32>
/// }
/// ```
/// is lowered to:
/// ```
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<16x1xf32>,
/// vector<1xf32>) {
/// %0 = "some_def"() : () -> (vector<16x32xf32>)
/// %acc = "some_def"() : () -> (vector<32xf32>)
/// gpu.yield %0, %acc : vector<16x32xf32>, vector<32xf32>
/// }
/// %c = arith.constant dense<0.0> : vector<1xf32>
/// %1 = vector.shape_cast %r#0 : vector<16x1xf32> to vector<16xf32>
/// %2 = vector.reduction <add>, %1, %r#1 : vector<16xf32> to f32
/// %3 = vector.insert %2, %c[0] : f32 into vector<1xf32>
/// ```
/// Example 2 (Reduction is non-lane-local):
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
/// %0 = "some_def"() : () -> (vector<2x32xf32>)
/// %acc = "some_def"() : () -> (vector<2xf32>)
/// %1 = vector.multi_reduction <add>, %0, %acc [1] : vector<2x32xf32> to
/// vector<2xf32>
/// gpu.yield %1 : vector<2xf32>
/// }
/// ```
/// is lowered to:
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
/// %0 = "some_def"() : () -> (vector<2x32xf32>)
/// %acc = "some_def"() : () -> (vector<2xf32>)
/// %1 = arith.constant dense<0.0> : vector<2xf32>
/// %2 = vector.extract %0[0] : vector<32xf32> from vector<2x32xf32>
/// %3 = ("warp.reduction %2") : f32
/// %4 = vector.insert %3, %1[0] : f32 into vector<2xf32>
/// ... repeat for row 1
/// gpu.yield %1 : vector<2xf32>
/// }
/// ```
struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
OpOperand *yieldOperand =
getWarpResult(warpOp, llvm::IsaPred<vector::MultiDimReductionOp>);
if (!yieldOperand)
return failure();
auto reductionOp =
cast<vector::MultiDimReductionOp>(yieldOperand->get().getDefiningOp());
unsigned operandNumber = yieldOperand->getOperandNumber();
VectorType sourceType = reductionOp.getSourceVectorType();
// Only 2D vectors are supported.
if (sourceType.getRank() != 2)
return rewriter.notifyMatchFailure(warpOp,
"Only 2D reductions are supported.");
ArrayRef<int64_t> reductionDims = reductionOp.getReductionDims();
// Only 1 reduction dimension supported. This also ensures that the result
// is vector type.
if (reductionDims.size() != 1)
return rewriter.notifyMatchFailure(
warpOp, "Only 1 reduction dimension is supported.");
int64_t reductionDim = reductionDims[0];
VectorType distributedResultType =
cast<VectorType>(warpOp.getResult(operandNumber).getType());
VectorType resultType = cast<VectorType>(reductionOp.getType());
xegpu::DistributeLayoutAttr sourceLayout =
xegpu::getDistributeLayoutAttr(reductionOp.getSource());
FailureOr<VectorType> sourceDistTypeOrFailure =
getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
if (failed(sourceDistTypeOrFailure))
return rewriter.notifyMatchFailure(
warpOp, "Failed to distribute the source vector type.");
VectorType sourceDistType = sourceDistTypeOrFailure.value();
// Only single dimension distribution is supported.
bool dim0Distributed =
sourceDistType.getShape()[0] != sourceType.getShape()[0];
bool dim1Distributed =
sourceDistType.getShape()[1] != sourceType.getShape()[1];
if (dim0Distributed && dim1Distributed)
return rewriter.notifyMatchFailure(
warpOp, "Expecting source to be distributed in a single dimension.");
int64_t sourceDistDim = dim0Distributed ? 0 : (dim1Distributed ? 1 : -1);
if (sourceDistDim == -1)
return rewriter.notifyMatchFailure(
warpOp, "Expecting a distributed source vector.");
bool resultDistributed =
distributedResultType.getNumElements() < resultType.getNumElements();
// If the lane owns all the data required for reduction (i.e. reduction is
    // fully parallel across lanes), then each lane owns part of the result
    // (i.e. result is distributed). If the reduction requires cross-lane
    // shuffling, then the result is shared among all lanes (broadcasted).
    // Therefore we expect the following cases:
//
// | Source vector | Reduction dim | Result vector |
// |----------------------|----------------|----------------|
// | dim-0 distributed | 0 | broadcasted |
// | dim-0 distributed | 1 | distributed |
// | dim-1 distributed | 0 | distributed |
// | dim-1 distributed | 1 | broadcasted |
bool isReductionLaneLocal = (sourceDistDim == 0 && reductionDim == 1) ||
(sourceDistDim == 1 && reductionDim == 0);
if (isReductionLaneLocal && !resultDistributed)
return rewriter.notifyMatchFailure(
warpOp, "Expecting a distributed result for lane-local reduction.");
if (!isReductionLaneLocal && resultDistributed)
return rewriter.notifyMatchFailure(
warpOp,
"Expecting a broadcasted result for non-lane-local reduction.");
// Handle lane-local reduction case. In this case we fully distribute the
// reduction result.
if (isReductionLaneLocal) {
// Yield the source and acc vectors from the WarpOp.
SmallVector<size_t> newRetIndices;
auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
{sourceDistType, distributedResultType}, newRetIndices);
rewriter.setInsertionPointAfter(newWarpOp);
Value result = lowerToVectorReductions(
cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[0])),
cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
// Replace the warp op result with the final result.
rewriter.replaceAllUsesWith(reductionOp.getResult(), result);
return success();
}
// For non-lane-local case, we simply rewrite the MultiReductionOp in terms
// of multiple ReductionOps. Actual distribution is done by the
// WarpOpReduction pattern.
rewriter.setInsertionPointAfter(reductionOp);
Value result = lowerToVectorReductions(
cast<TypedValue<VectorType>>(reductionOp.getSource()),
cast<TypedValue<VectorType>>(reductionOp.getAcc()),
reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
// Replace the warp op result with the final result.
rewriter.replaceAllUsesWith(reductionOp.getResult(), result);
return success();
}
};
/// Distribute a `vector.shape_cast` op feeding into yield op of an enclosing
/// `gpu.warp_execute_on_lane_0` region.
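/// Both the source and the result of the shape_cast must carry distribution
/// layouts. For rank-changing casts, the lower-rank layout must be a slice of
/// the higher-rank layout. The source vector is yielded from the warp op with
/// its distributed type, and a new shape_cast to the distributed result type
/// is created after the warp op.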
struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
OpOperand *yieldOperand =
getWarpResult(warpOp, llvm::IsaPred<vector::ShapeCastOp>);
if (!yieldOperand)
return failure();
auto shapeCastOp =
cast<vector::ShapeCastOp>(yieldOperand->get().getDefiningOp());
unsigned operandNumber = yieldOperand->getOperandNumber();
auto resultDistTy =
cast<VectorType>(warpOp.getResult(operandNumber).getType());
xegpu::DistributeLayoutAttr sourceLayout =
xegpu::getDistributeLayoutAttr(shapeCastOp.getSource());
xegpu::DistributeLayoutAttr resultLayout =
xegpu::getDistributeLayoutAttr(shapeCastOp.getResult());
if (!sourceLayout || !resultLayout)
return rewriter.notifyMatchFailure(
warpOp,
"the source or result of shape_cast op lacks distribution layout");
// For rank reducing or increasing shape_cast ops, the lower rank layout
/// must be a slice of the higher rank layout.
int64_t sourceRank = shapeCastOp.getSourceVectorType().getRank();
int64_t resultRank = shapeCastOp.getResultVectorType().getRank();
if (sourceRank < resultRank && !sourceLayout.isSliceOf(resultLayout))
return rewriter.notifyMatchFailure(
warpOp, "shape_cast is rank reducing but source layout is not a "
"slice of result layout");
if (sourceRank > resultRank && !resultLayout.isSliceOf(sourceLayout))
return rewriter.notifyMatchFailure(
warpOp, "shape_cast is rank increasing but result layout is not a "
"slice of source layout");
FailureOr<VectorType> sourceDistTypeOrFailure =
getDistVecTypeBasedOnLaneLayout(sourceLayout,
shapeCastOp.getSourceVectorType());
if (failed(sourceDistTypeOrFailure))
return rewriter.notifyMatchFailure(
warpOp, "failed to get distributed vector type for source");
VectorType sourceDistType = sourceDistTypeOrFailure.value();
// Create a new warp op that yields the source of the shape_cast op.
SmallVector<size_t> newRetIndices;
auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, {shapeCastOp.getSource()}, {sourceDistType},
newRetIndices);
rewriter.setInsertionPointAfter(newWarpOp);
Value source = newWarpOp.getResult(newRetIndices[0]);
// Create a new shape_cast op outside the warp op.
Value newShapeCast = vector::ShapeCastOp::create(
rewriter, shapeCastOp.getLoc(), resultDistTy, source);
rewriter.replaceAllUsesWith(newWarpOp.getResult(operandNumber),
newShapeCast);
return success();
}
};
} // namespace
namespace {
struct XeGPUSubgroupDistributePass final
: public xegpu::impl::XeGPUSubgroupDistributeBase<
XeGPUSubgroupDistributePass> {
XeGPUSubgroupDistributePass() = default;
XeGPUSubgroupDistributePass(const XeGPUSubgroupDistributePass &other) =
default;
XeGPUSubgroupDistributePass(xegpu::XeGPUSubgroupDistributeOptions options)
: XeGPUSubgroupDistributeBase(options) {}
void runOnOperation() override;
};
} // namespace
void xegpu::populateXeGPUSubgroupDistributePatterns(
RewritePatternSet &patterns) {
patterns
.add<CreateNdDescDistribution, StoreNdDistribution, LoadNdDistribution,
DpasDistribution, PrefetchNdDistribution, UpdateNdOffsetDistribution,
GpuBarrierDistribution, VectorMultiReductionDistribution,
LoadDistribution, StoreDistribution>(
patterns.getContext(),
/*pattern benefit=*/regularPatternBenefit);
patterns.add<VectorShapeCastDistribution>(
patterns.getContext(),
/*pattern benefit=*/highPatternBenefit);
}
void XeGPUSubgroupDistributePass::runOnOperation() {
// Step 1: Attach layouts to op operands.
// TODO: Following assumptions are made:
// 1) It is assumed that there are no layout conflicts.
// 2) Any existing layout attributes attached to the operands are ignored.
Operation *op = getOperation();
op->walk([&](Operation *op) {
for (OpOperand &operand : op->getOpOperands()) {
// Layouts are needed for vector type only.
if (!isa<VectorType>(operand.get().getType()))
continue;
auto layout = xegpu::getDistributeLayoutAttr(operand.get());
if (!layout) {
op->emitError("Could not find layout attribute for operand ")
<< operand.getOperandNumber() << " of operation " << op->getName();
signalPassFailure();
return;
}
xegpu::setDistributeLayoutAttr(operand, layout);
}
});
// Step 2: Move all operations of a GPU function inside
// gpu.warp_execute_on_lane_0 operation.
{
RewritePatternSet patterns(&getContext());
patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
signalPassFailure();
return;
}
// At this point, we have moved the entire function body inside the
// warpOp. Now move any scalar uniform code outside of the warpOp (like
// GPU index ops, scalar constants, etc.). This will simplify the
// later lowering and avoid custom patterns for these ops.
getOperation()->walk([&](Operation *op) {
if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op))
vector::moveScalarUniformCode(warpOp);
});
}
// Step 3: Apply subgroup to workitem distribution patterns.
RewritePatternSet patterns(&getContext());
xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
// distributionFn is used by vector distribution patterns to determine the
// distributed vector type for a given vector value. In XeGPU subgroup
// distribution context, we compute this based on lane layout.
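  // For example, a vector<8x16xf32> value with lane_layout [1, 16] is only
  // distributed along dimension 1, producing the map (d0, d1) -> (d1).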
auto distributionFn = [](Value val) {
VectorType vecType = dyn_cast<VectorType>(val.getType());
int64_t vecRank = vecType ? vecType.getRank() : 0;
if (vecRank == 0)
return AffineMap::get(val.getContext());
// Get the layout of the vector type.
xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(val);
    // If no layout is specified, assume the innermost dimension is
    // distributed for now.
if (!layout)
return AffineMap::getMultiDimMapWithTargets(
vecRank, {static_cast<unsigned int>(vecRank - 1)}, val.getContext());
SmallVector<unsigned int> distributedDims;
for (auto [i, v] : llvm::enumerate(layout.getEffectiveLaneLayoutAsInt())) {
if (v > 1)
distributedDims.push_back(i);
}
return AffineMap::getMultiDimMapWithTargets(vecRank, distributedDims,
val.getContext());
};
// TODO: shuffleFn is not used.
auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
int64_t warpSz) { return Value(); };
auto warpReduction = [](Location loc, OpBuilder &builder, Value input,
vector::CombiningKind kind, uint32_t size) {
// First reduce on a single thread to get per lane reduction value.
Value laneVal = builder.create<vector::ReductionOp>(loc, kind, input);
// Parallel reduction using butterfly shuffles.
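    // For example, with size = 16 the loop exchanges values at XOR offsets
    // 1, 2, 4 and 8, after which every lane holds the full reduction.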
for (uint64_t i = 1; i < size; i <<= 1) {
Value shuffled =
builder
.create<gpu::ShuffleOp>(loc, laneVal, i,
/*width=*/size,
/*mode=*/gpu::ShuffleMode::XOR)
.getShuffleResult();
laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled);
}
return laneVal;
};
if (enableSGReductions)
vector::populateDistributeReduction(
patterns, warpReduction,
/*pattern benefit=*/regularPatternBenefit);
vector::populatePropagateWarpVectorDistributionPatterns(
patterns, distributionFn, shuffleFn,
/*pattern benefit=*/regularPatternBenefit);
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
signalPassFailure();
return;
}
  // Step 4: Finally, clean up UnrealizedConversionCastOps that were inserted
  // due to tensor desc type mismatches created by using upstream distribution
  // patterns (scf.for).
getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) {
    // We are only interested in UnrealizedConversionCastOps that were added
// for resolving SIMT type mismatches.
if (!op->getAttr(resolveSIMTTypeMismatch))
return WalkResult::skip();
Value input = op.getOperand(0);
Value output = op.getResult(0);
// Both input and output must have tensor descriptor types.
xegpu::TensorDescType inputDescType =
mlir::dyn_cast<xegpu::TensorDescType>(input.getType());
xegpu::TensorDescType outputDescType =
mlir::dyn_cast<xegpu::TensorDescType>(output.getType());
assert(inputDescType && outputDescType &&
"Unrealized conversion cast must have tensor descriptor types");
    // tensor_desc<shape, layout> -> tensor_desc<shape> type of conversion.
    // This occurs inside the scf.for body to resolve the block argument type
    // to the SIMT type.
if (inputDescType.getLayout()) {
auto argument = mlir::dyn_cast<mlir::BlockArgument>(input);
if (argument) {
argument.setType(output.getType());
output.replaceAllUsesWith(argument);
if (auto loopOp = mlir::dyn_cast<mlir::LoopLikeOpInterface>(
argument.getOwner()->getParentOp())) {
auto result = loopOp.getTiedLoopResult(argument);
result.setType(output.getType());
}
}
}
    // tensor_desc<shape> -> tensor_desc<shape, layout> type of conversion.
    // This occurs at the yield op of the scf.for body to go back from the
    // SIMT type to the original type.
if (outputDescType.getLayout())
output.replaceAllUsesWith(input);
if (op->use_empty())
op->erase();
return WalkResult::advance();
});
}