| //===- XeGPUSubgroupDistribute.cpp - XeGPU Subgroup Distribute Pass -------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| #include "mlir/Dialect/Affine/Utils.h" |
| #include "mlir/Dialect/GPU/IR/GPUDialect.h" |
| #include "mlir/Dialect/GPU/Utils/DistributionUtils.h" |
| #include "mlir/Dialect/Index/IR/IndexDialect.h" |
| #include "mlir/Dialect/MemRef/IR/MemRef.h" |
| #include "mlir/Dialect/Vector/IR/VectorOps.h" |
| #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h" |
| #include "mlir/Dialect/XeGPU/IR/XeGPU.h" |
| #include "mlir/Dialect/XeGPU/Transforms/Passes.h" |
| #include "mlir/Dialect/XeGPU/Transforms/Transforms.h" |
| #include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h" |
| #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" |
| #include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" |
| #include "mlir/IR/AffineMap.h" |
| #include "mlir/IR/Attributes.h" |
| #include "mlir/IR/Builders.h" |
| #include "mlir/IR/BuiltinAttributes.h" |
| #include "mlir/IR/BuiltinOps.h" |
| #include "mlir/IR/BuiltinTypes.h" |
| #include "mlir/IR/Operation.h" |
| #include "mlir/IR/PatternMatch.h" |
| #include "mlir/IR/TypeRange.h" |
| #include "mlir/IR/Value.h" |
| #include "mlir/IR/Visitors.h" |
| #include "mlir/Interfaces/FunctionInterfaces.h" |
| #include "mlir/Support/LLVM.h" |
| #include "mlir/Transforms/DialectConversion.h" |
| #include "mlir/Transforms/GreedyPatternRewriteDriver.h" |
| #include "mlir/Transforms/InliningUtils.h" |
| #include "llvm/ADT/ArrayRef.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/ADT/SmallVectorExtras.h" |
| |
| namespace mlir { |
| namespace xegpu { |
| #define GEN_PASS_DEF_XEGPUSUBGROUPDISTRIBUTE |
| #include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc" |
| } // namespace xegpu |
| } // namespace mlir |
| |
| #define DEBUG_TYPE "xegpu-subgroup-distribute" |
| #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") |
| |
| using namespace mlir; |
| |
static const char *const resolveSIMTTypeMismatch =
    "resolve_simt_type_mismatch"; // Attribute name for identifying
                                  // UnrealizedConversionCastOp added to
                                  // resolve SIMT type mismatches.
| |
| namespace { |
| |
| //===----------------------------------------------------------------------===// |
| // SIMT Distribution Patterns |
| //===----------------------------------------------------------------------===// |
| |
| /// In certain cases, we may need to favor XeGPU specific distribution patterns |
| /// over generic vector distribution patterns. In such cases, we can assign |
| /// priorities to patterns. |
| enum PatternHierarchy : unsigned { Regular = 1, AboveRegular = 2 }; |
| |
| /// Helper function to resolve types if the distributed type out of |
| /// gpu.warp_execute_on_lane0 is different from the expected xegpu SIMT type. |
| /// Example 1: |
| /// distributed type: vector<8x1xf32> |
| /// expected type: vector<8xf32> |
| /// resolved using, |
| /// %0 = vector.shape_cast %1 : vector<8x1xf32> to vector<8xf32> |
| /// Example 2: |
| /// distributed type: xegpu.tensor_desc<8x16xf32, #xegpu.layout<...>> |
| /// expected type: xegpu.tensor_desc<8x16xf32> |
| /// resolved using, |
| /// %0 = unrealized_conversion_cast %1 : |
| /// xegpu.tensor_desc<8x16xf32, #xegpu.layout<..>> -> |
| /// xegpu.tensor_desc<8x16xf32> |
| template <typename T> |
| static Value resolveDistributedTy(Value orig, T expected, |
| PatternRewriter &rewriter) { |
| // If orig and expected types are the same, return orig. |
| if (orig.getType() == expected) |
| return orig; |
| // If orig is a vector type, create a shape cast op to reconcile the types. |
| if (isa<VectorType>(orig.getType())) { |
| auto castOp = |
| vector::ShapeCastOp::create(rewriter, orig.getLoc(), expected, orig); |
| return castOp.getResult(); |
| } |
| // If orig is a tensor descriptor type, create an unrealized conversion cast |
| // op to reconcile the types. |
| if (isa<xegpu::TensorDescType>(orig.getType())) { |
| auto castOp = UnrealizedConversionCastOp::create(rewriter, orig.getLoc(), |
| expected, orig); |
| castOp->setAttr(resolveSIMTTypeMismatch, rewriter.getUnitAttr()); |
| return castOp.getResult(0); |
| } |
| llvm_unreachable("Unsupported type for reconciliation"); |
| return orig; |
| } |
| |
| /// Given a vector type and its distributed vector type, return the list of |
| /// dimensions that are distributed. |
| static SmallVector<int64_t> getDistributedDims(VectorType originalType, |
| VectorType distributedType) { |
| assert(originalType.getRank() == distributedType.getRank() && |
| "sequential and distributed vector types must have the same rank"); |
| SmallVector<int64_t> distributedDims; |
| for (int64_t i = 0; i < originalType.getRank(); ++i) { |
| if (distributedType.getDimSize(i) != originalType.getDimSize(i)) { |
| distributedDims.push_back(i); |
| } |
| } |
| return distributedDims; |
| } |
| |
| /// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body |
| /// of the original GPUFuncOp to the new GPUFuncOp such that entire body is |
| /// contained within a WarpExecuteOnLane0Op. |
| /// Example: |
| /// |
| /// ``` |
| /// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> { |
| /// ... |
| /// ... |
| /// gpu.return %result: vector<8x16xf32> |
| /// } |
| /// ``` |
| /// To |
| /// ``` |
| /// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> { |
| /// %laneid = gpu.lane_id : index |
| /// %0 = gpu.warp_execute_on_lane_0(%laneid) -> vector<8x16xf32> { |
| /// ... |
| /// ... |
| /// gpu.yield %result: vector<8x16xf32> |
| /// } |
| /// return %0 |
| /// } |
struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
  using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
  LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
                                PatternRewriter &rewriter) const override {
    // The subgroup (warp) size is taken from the target uArch attached to the
    // function; without it the WarpExecuteOnLane0Op cannot be sized.
    auto uArch = getUArch(xegpu::getChipStr(gpuFuncOp).value_or(""));
    if (!uArch)
      return rewriter.notifyMatchFailure(
          gpuFuncOp, "Subgroup distribution requires target attribute attached "
                     "to set the warp size");
    // If the function only contains a single void return, skip.
    if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
          return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
        }))
      return failure();
    // If the function already moved inside a warp_execute_on_lane0, skip.
    // This check keeps the pattern from firing repeatedly on its own output.
    if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
          return isa<gpu::WarpExecuteOnLane0Op>(op);
        }))
      return failure();
    // Create a new function with the same signature and same attributes.
    // Workgroup/private attributions must be recreated explicitly since they
    // are not part of the function type.
    SmallVector<Type> workgroupAttributionsTypes =
        llvm::map_to_vector(gpuFuncOp.getWorkgroupAttributions(),
                            [](BlockArgument arg) { return arg.getType(); });
    SmallVector<Type> privateAttributionsTypes =
        llvm::map_to_vector(gpuFuncOp.getPrivateAttributions(),
                            [](BlockArgument arg) { return arg.getType(); });
    auto newGpuFunc = gpu::GPUFuncOp::create(
        rewriter, gpuFuncOp.getLoc(), gpuFuncOp.getName(),
        gpuFuncOp.getFunctionType(), workgroupAttributionsTypes,
        privateAttributionsTypes);
    newGpuFunc->setAttrs(gpuFuncOp->getAttrs());
    // Create a WarpExecuteOnLane0Op with same arguments and results as the
    // original gpuFuncOp.
    rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
    auto laneId = gpu::LaneIdOp::create(
        rewriter, newGpuFunc.getLoc(), rewriter.getIndexType(),
        /** upperBound = **/ mlir::IntegerAttr());
    ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
    auto warpOp = gpu::WarpExecuteOnLane0Op::create(
        rewriter, laneId.getLoc(), gpuFuncResultType, laneId,
        uArch->getSubgroupSize(), newGpuFunc.getArguments(),
        newGpuFunc.getArgumentTypes());
    Block &warpBodyBlock = warpOp.getBodyRegion().front();
    // Replace the ReturnOp of the original gpu function with a YieldOp so the
    // original return values are forwarded out of the warp region.
    auto origRetunOp =
        cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
    rewriter.setInsertionPointAfter(origRetunOp);
    gpu::YieldOp::create(rewriter, origRetunOp.getLoc(),
                         origRetunOp.getOperands());
    rewriter.eraseOp(origRetunOp);
    // Move the original function body to the WarpExecuteOnLane0Op body. The
    // block that was created together with the warp op becomes redundant and
    // is erased afterwards.
    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
                                warpOp.getBodyRegion().begin());
    rewriter.eraseBlock(&warpBodyBlock);
    // Insert a new ReturnOp after the WarpExecuteOnLane0Op.
    rewriter.setInsertionPointAfter(warpOp);
    gpu::ReturnOp::create(rewriter, newGpuFunc.getLoc(), warpOp.getResults());
    rewriter.replaceOp(gpuFuncOp, newGpuFunc);
    return success();
  }
};
| |
| /// Distribute a create_nd_tdesc feeding into vector.yield op of the enclosing |
| /// `gpu.warp_execute_on_lane_0` region. After the sinking, the warp op will |
| /// still contain the original op that will not be used by the yield op (and |
| /// should be cleaned up later). The yield op will bypass the create_nd_tdesc's |
| /// arguments. Tensor descriptor shape is not distributed because it is a |
| /// uniform value across all work items within the subgroup. However, the |
| /// layout information is dropped in the new tensor descriptor type. |
| /// |
| /// Example: |
| /// |
| /// ``` |
| /// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]> |
| /// %r = gpu.warp_execute_on_lane_0(%laneid) -> |
| /// (!xegpu.tensor_desc<4x8xf32, #layout0>) { |
| /// ... |
| /// %td = xegpu.create_nd_tdesc %arg0 |
| /// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0> |
| /// vector.yield %td |
| /// } |
| /// ``` |
| /// To |
| /// ``` |
| /// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) { |
| /// ... |
| /// %dead = xegpu.create_nd_tdesc %arg0 |
| /// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0> |
| /// vector.yield %arg0, %dead |
| /// } |
| /// %td = xegpu.create_nd_tdesc %r#0: memref<4x8xf32> |
| /// -> !xegpu.tensor_desc<4x8xf32> |
| /// |
| /// ``` |
struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand =
        getWarpResult(warpOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
    if (!operand)
      return rewriter.notifyMatchFailure(
          warpOp, "warp result is not a xegpu::CreateNdDesc op");
    auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
    unsigned operandIdx = operand->getOperandNumber();

    xegpu::LayoutAttr layout = descOp.getType().getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          descOp, "the tensor descriptor lacks layout attribute");
    // CreateNdOp must not have offsets.
    if (descOp.getMixedOffsets().size())
      return rewriter.notifyMatchFailure(
          descOp, "xegpu::CreateNdDescOp must not have offsets");

    // Yield the create_nd_tdesc operands from the warp op so they are
    // available outside the region; newRetIndices maps each yielded value to
    // the corresponding result of the new warp op.
    SmallVector<size_t> newRetIndices;
    rewriter.setInsertionPoint(warpOp);
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, /* new yieled values = */ descOp->getOperands(),
        /* new yielded types = */ descOp.getOperandTypes(), newRetIndices);

    // Rebuild the descriptor outside the warp op from the escaped operands.
    SmallVector<Value> newDescOperands = llvm::map_to_vector(
        newRetIndices, [&](size_t i) { return newWarpOp.getResult(i); });
    rewriter.setInsertionPointAfter(newWarpOp);
    xegpu::TensorDescType distributedTensorDescTy =
        descOp.getType().dropLayouts(); // Distributed tensor descriptor type
                                        // does not contain layout info.
    Value newDescOp = xegpu::CreateNdDescOp::create(
        rewriter, newWarpOp.getLoc(), distributedTensorDescTy, newDescOperands,
        descOp->getAttrs());

    Value distributedVal = newWarpOp.getResult(operandIdx);
    // Resolve the distributed type to the expected type (inserts a cast if
    // the warp-distributed type differs from the xegpu SIMT type).
    newDescOp =
        resolveDistributedTy(newDescOp, distributedVal.getType(), rewriter);
    rewriter.replaceAllUsesWith(distributedVal, newDescOp);
    return success();
  }
};
| |
| /// Distribute a store_nd op at the end of enclosing |
| /// `gpu.warp_execute_on_lane_0`. In case arguments for the store are passed |
| /// through the warp op interface they would be propagated as returned values. |
| /// Source vector is distributed based on lane layout. Appropriate cast ops are |
| /// inserted if the distributed types does not match expected xegpu SIMT types. |
| /// |
| /// Example: |
| /// |
| /// ``` |
| /// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]> |
| /// gpu.warp_execute_on_lane_0(%laneid) -> () { |
| /// ... |
| /// xegpu.store_nd %arg0, %arg1 [%x, %y]: vector<4x8xf32>, |
| /// !xegpu.tensor_desc<4x8xf32, #layout0> |
| /// } |
| /// ``` |
| /// To |
| /// ``` |
| /// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>, |
| /// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) { |
| /// ... |
| /// gpu.yield %arg0, %arg1, %x, %y: vector<4x8xf32>, |
| /// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index |
| /// } |
| /// %0 = vector.shape_cast %r#0: vector<4x1xf32> to vector<4xf32> |
| /// %1 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32, |
| /// #layout0> |
| /// -> !xegpu.tensor_desc<4x8xf32> |
| /// xegpu.store_nd %0, %1 [%r#2, %r#3]: vector<4xf32>, |
| /// !xegpu.tensor_desc<4x8xf32> |
| /// |
| /// ``` |
struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Only match when the store is the last op before the terminator, so
    // sinking it cannot move it across barriers or other side effects.
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
    if (!storeOp)
      return failure();

    SmallVector<OpFoldResult> offsets = storeOp.getMixedOffsets();
    // Expecting offsets to be present.
    if (offsets.empty())
      return rewriter.notifyMatchFailure(storeOp,
                                         "the store op must have offsets");
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, storeOp.getLoc(), offsets);
    SmallVector<Type> offsetTypes = llvm::map_to_vector(
        offsetsAsValues, [](Value v) { return v.getType(); });
    xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType();
    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          storeOp, "the source tensor descriptor lacks layout attribute");

    // Compute the vector type the warp op will hand out for the stored value.
    FailureOr<VectorType> distributedTypeByWarpOpOrFailure =
        xegpu::getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
    if (failed(distributedTypeByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(storeOp,
                                         "Failed to distribute the type");
    VectorType distributedTypeByWarpOp =
        distributedTypeByWarpOpOrFailure.value();

    // Yield order is fixed: [0] value, [1] tensor desc, [2..] offsets. The
    // operand collection below relies on this layout of newRetIndices.
    SmallVector<size_t> newRetIndices;
    SmallVector<Value> newYieldedValues = {storeOp.getValue(),
                                           storeOp.getTensorDesc()};
    SmallVector<Type> newYieldedTypes = {distributedTypeByWarpOp, tensorDescTy};
    newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
    newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);
    // Create a new store op outside the warp op with the distributed vector
    // type. Tensor descriptor is not distributed.
    rewriter.setInsertionPointAfter(newWarpOp);
    SmallVector<Value> newStoreOperands;

    // For the value operand, there can be a mismatch between the vector type
    // distributed by the warp op and (xegpu-specific) distributed type
    // supported by the store op. Type mismatch must be resolved using
    // appropriate cast op.
    FailureOr<VectorType> storeNdDistributedValueTyOrFailure =
        xegpu::getDistributedVectorType(storeOp.getTensorDescType());
    if (failed(storeNdDistributedValueTyOrFailure))
      return rewriter.notifyMatchFailure(
          storeOp, "Failed to get distributed vector type for the store op");
    newStoreOperands.push_back(resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[0]),
        storeNdDistributedValueTyOrFailure.value(), rewriter));
    // For the tensor descriptor operand, the layout attribute is dropped after
    // distribution. Types needs to be resolved in this case also.
    xegpu::TensorDescType distributedTensorDescTy =
        storeOp.getTensorDescType().dropLayouts();
    newStoreOperands.push_back(
        resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
                             distributedTensorDescTy, rewriter));
    // Collect offsets. Offsets are uniform and need no type resolution.
    for (size_t i = 2; i < newRetIndices.size(); ++i)
      newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[i]));

    auto newStoreOp =
        xegpu::StoreNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
                                 newStoreOperands, storeOp->getAttrs());
    xegpu::removeLayoutAttrs(newStoreOp);
    rewriter.eraseOp(storeOp);
    return success();
  }
};
| |
| /// Distribute a load_nd op feeding into vector.yield op for the enclosing |
| /// `gpu.warp_execute_on_lane_0` and put it after the warp op. |
| /// The warp op will still contain the original op that will not be used by |
| /// the yield op (and should be cleaned up later). The yield op will |
| /// bypass the load's arguments. Only the loaded vector is distributed |
| /// according to lane layout and, tensor descriptor types is not |
| /// distributed. Appropriate cast ops are inserted if the distributed types does |
| /// not match expected xegpu SIMT types. |
| /// |
| /// Example: |
| /// |
| /// ``` |
| /// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]> |
| /// %r = gpu.warp_execute_on_lane_0(%laneid) -> |
| /// (vector<4x1xf32>) { |
| /// ... |
| /// %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32, #layout0> |
| /// -> |
| /// vector<4x8xf32> |
| /// gpu.yield %ld |
| /// } |
| /// ``` |
| /// To |
| /// ``` |
| /// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>, |
| /// !xegpu.tensor_desc<4x8xf32, #layout0>) { |
| /// ... |
| /// %dead = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32, #layout0> -> |
| /// vector<4x8xf32> gpu.yield %dead, %arg0 |
| /// } |
| /// %0 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32, |
| /// #layout0> -> !xegpu.tensor_desc<4x8xf32> |
| /// %1 = xegpu.load_nd %0: !xegpu.tensor_desc<4x8xf32> -> vector<4xf32> |
| /// %2 = vector.shape_cast %r#0: vector<4xf32> to vector<4x1xf32> |
| /// |
| /// ``` |
struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) {
      if (!isa<xegpu::LoadNdOp>(op))
        return false;
      // Make sure the same load op is the last operation in the warp op body.
      // This ensure that load op is not sinked earlier violating any barrier
      // synchronizations.
      gpu::YieldOp yield = warpOp.getTerminator();
      return yield->getPrevNode() == op;
    });

    if (!operand)
      return rewriter.notifyMatchFailure(
          warpOp, "warp result is not a xegpu::LoadNd op");

    auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
    auto uArch = getUArch(xegpu::getChipStr(loadOp).value_or(""));
    if (!uArch)
      return rewriter.notifyMatchFailure(
          loadOp, "xegpu::LoadNdOp require target attribute attached to "
                  "determine transpose "
                  "requirement");
    // Chip information is required to decide if the layout requires transpose
    // effect.
    // Expecting offsets to be present.
    SmallVector<OpFoldResult> offsets = loadOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(loadOp,
                                         "the load op must have offsets");
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, loadOp.getLoc(), offsets);
    SmallVector<Type> offsetTypes = llvm::map_to_vector(
        offsetsAsValues, [](Value v) { return v.getType(); });

    xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          loadOp, "the source tensor descriptor lacks layout attribute");

    // The type the warp op already assigned to the load's result.
    unsigned operandIdx = operand->getOperandNumber();
    VectorType distributedTypeByWarpOp =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());

    // Yield order is fixed: [0] tensor desc, [1..] offsets. The operand
    // collection below relies on this layout of newRetIndices.
    SmallVector<size_t> newRetIndices;
    SmallVector<Value> newYieldedValues = {loadOp.getTensorDesc()};
    SmallVector<Type> newYieldedTypes = {tensorDescTy};
    newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
    newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);

    // Create a new load op outside the warp op with the distributed vector
    // type.
    rewriter.setInsertionPointAfter(newWarpOp);
    FailureOr<VectorType> loadNdDistValueTyOrFailure =
        xegpu::getDistributedVectorType(loadOp.getTensorDescType());
    if (failed(loadNdDistValueTyOrFailure))
      return rewriter.notifyMatchFailure(
          loadOp, "Failed to get distributed vector type for the load op");
    xegpu::TensorDescType distributedTensorDescTy =
        loadOp.getTensorDescType().dropLayouts(); // Distributed tensor
                                                  // descriptor type does not
                                                  // contain layout info.
    SmallVector<Value> newLoadOperands{
        resolveDistributedTy(newWarpOp.getResult(newRetIndices[0]),
                             distributedTensorDescTy, rewriter)};
    // Collect offsets. Offsets are uniform and need no type resolution.
    for (size_t i = 1; i < newRetIndices.size(); ++i)
      newLoadOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
    auto newLoadOp = xegpu::LoadNdOp::create(
        rewriter, newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
        newLoadOperands, loadOp->getAttrs());
    xegpu::removeLayoutAttrs(newLoadOp);
    // Set the packed attribute if the layout requires it.
    newLoadOp.setPacked(xegpu::requirePacked(layout));
    // Set the transpose attribute if the layout requires it.
    if (xegpu::requireTranspose(layout, uArch))
      newLoadOp.setTranspose(
          DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));
    Value distributedVal = newWarpOp.getResult(operandIdx);
    // There can be a conflict between the vector type distributed by the
    // warp op and (xegpu-specific) distributed type supported by the load
    // op. Resolve these mismatches by inserting a cast.
    Value tyResolvedVal = resolveDistributedTy(
        newLoadOp.getResult(), distributedTypeByWarpOp, rewriter);
    rewriter.replaceAllUsesWith(distributedVal, tyResolvedVal);
    return success();
  }
};
| |
| /// Distribute a dpas op feeding into vector.yield op for the enclosing |
| /// `gpu.warp_execute_on_lane_0` and put it after the warp op. |
| /// The warp op will still contain the original op that will not be used by |
| /// the yield op (and should be cleaned up later). The yield op will |
| /// bypass the dpas's arguments. Appropriate cast ops are inserted if the |
| /// distributed types does not match expected xegpu SIMT types. |
| /// Example: |
| /// ``` |
| /// #lo_a = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]> |
| /// #lo_b = #xegpu.layout<wi_layout = [1, 16], wi_data = [2, 1]> |
| /// #lo_c = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]> |
| /// %r = gpu.warp_execute_on_lane_0(%laneid) -> |
| /// (vector<8x1xf32>) { |
| /// ... |
| /// %dpas = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16> -> |
| /// vector<8x16xf32> |
| /// gpu.yield %dpas |
| /// } |
| /// ``` |
| /// To |
| /// ``` |
| /// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<8x1xf32>, |
| /// vector<8x1xf16>, vector<16x1xf16>) { |
| /// ... |
| /// %dead = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16> |
| /// -> vector<8x16xf32> |
| /// gpu.yield %dead, %arg0, %arg1 |
| /// } |
| /// %0 = vector.shape_cast %r#1: vector<8x1xf16> to vector<8xf16> |
| /// %1 = vector.shape_cast %r#2: vector<16x1xf16> to vector<16xf16> |
| /// %2 = xegpu.dpas %0, %1: vector<8xf16>, vector<16xf16> -> |
| /// vector<8xf32> |
| /// %dpas = vector.shape_cast %2: vector<8xf32> to vector<8x1xf32> |
| /// ``` |
struct DpasDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<xegpu::DpasOp>);
    if (!operand)
      return rewriter.notifyMatchFailure(warpOp,
                                         "warp result is not a xegpu::Dpas op");

    auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
    unsigned operandIdx = operand->getOperandNumber();

    // All three layouts (A, B, output/acc) are needed to distribute each
    // operand independently.
    xegpu::LayoutAttr layoutA =
        dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutAAttr());
    xegpu::LayoutAttr layoutB =
        dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutBAttr());
    xegpu::LayoutAttr layoutOut =
        dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutCdAttr());

    if (!layoutA || !layoutB || !layoutOut)
      return rewriter.notifyMatchFailure(
          dpasOp,
          "the xegpu::Dpas op lacks layout attribute for A, B or output");

    // Types the warp op will distribute each operand to (per lane layout).
    FailureOr<VectorType> distLhsTypeByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutA, dpasOp.getLhsType());
    FailureOr<VectorType> distRhsTypeByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType());
    FailureOr<VectorType> distResultTypeByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType());

    if (failed(distLhsTypeByWarpOpOrFailure) ||
        failed(distRhsTypeByWarpOpOrFailure) ||
        failed(distResultTypeByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(
          dpasOp,
          "Failed to distribute the A, B or output types in xegpu::Dpas op");

    // Yield order is fixed: [0] lhs, [1] rhs, [2] acc (only if present). The
    // expected-type vector built below mirrors this order exactly.
    llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(),
                                               dpasOp.getRhs()};
    llvm::SmallVector<Type, 3> newYieldTypes{
        distLhsTypeByWarpOpOrFailure.value(),
        distRhsTypeByWarpOpOrFailure.value()};
    // Dpas acc operand is optional.
    if (dpasOp.getAcc()) {
      newYieldValues.push_back(dpasOp.getAcc());
      newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
    }
    // Create a new warp op without the dpas.
    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);

    // Types the dpas op itself expects in SIMT form (may differ from the
    // warp-distributed types above; mismatches are resolved with casts).
    FailureOr<VectorType> expectedDistLhsTyOrFailure =
        xegpu::getDistributedVectorType(dpasOp.getLhsType(), layoutA);
    FailureOr<VectorType> expectedDistRhsTyOrFailure =
        xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB);
    FailureOr<VectorType> expectedDistResultTyOrFailure =
        xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut);

    if (failed(expectedDistLhsTyOrFailure) ||
        failed(expectedDistRhsTyOrFailure) ||
        failed(expectedDistResultTyOrFailure))
      return rewriter.notifyMatchFailure(
          dpasOp,
          "Failed to get distributed vector type for the dpas operands.");
    // Create a new dpas op outside the warp op.
    rewriter.setInsertionPointAfter(newWarpOp);
    SmallVector<Value> newDpasOperands;
    SmallVector<VectorType> newDpasOperandExpectedTypes;

    // Resolve the distributed types with the original types.
    newDpasOperandExpectedTypes.push_back(expectedDistLhsTyOrFailure.value());
    newDpasOperandExpectedTypes.push_back(expectedDistRhsTyOrFailure.value());
    VectorType distributedResultTy = expectedDistResultTyOrFailure.value();
    if (dpasOp.getAcc())
      newDpasOperandExpectedTypes.push_back(distributedResultTy);

    // newRetIndices and newDpasOperandExpectedTypes are index-aligned by
    // construction (both follow the lhs/rhs/[acc] order).
    for (unsigned i = 0; i < newRetIndices.size(); i++) {
      newDpasOperands.push_back(
          resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]),
                               newDpasOperandExpectedTypes[i], rewriter));
    }
    auto newDpasOp = xegpu::DpasOp::create(rewriter, newWarpOp->getLoc(),
                                           distributedResultTy, newDpasOperands,
                                           dpasOp->getAttrs());
    xegpu::removeLayoutAttrs(newDpasOp);
    Value distributedVal = newWarpOp.getResult(operandIdx);
    // Resolve the output type.
    Value typeResolved =
        resolveDistributedTy(newDpasOp.getResult(),
                             distResultTypeByWarpOpOrFailure.value(), rewriter);
    rewriter.replaceAllUsesWith(distributedVal, typeResolved);
    return success();
  }
};
| |
| /// Distribute a prefetch_nd op at the end of enclosing |
| /// `gpu.warp_execute_on_lane_0`. In case arguments for the prefetch are passed |
| /// through the warp op interface they would be propagated as returned values. |
| /// Tensor descriptor shape is not distributed because it is a uniform value |
| /// across all work items within the subgroup. Appropriate cast ops are inserted |
| /// if the distributed types does not match expected xegpu SIMT types. |
| /// |
| /// Example: |
| /// |
| /// ``` |
| /// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]> |
| /// gpu.warp_execute_on_lane_0(%laneid) -> () { |
| /// ... |
| /// xegpu.prefetch_nd %arg0 [%x, %y] : !xegpu.tensor_desc<4x8xf32, #layout0> |
| /// } |
| /// ``` |
| /// To |
| /// ``` |
| /// %r:1 = gpu.warp_execute_on_lane_0(%laneid) -> ( |
| /// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) { |
| /// gpu.yield %arg0, %x, %y: !xegpu.tensor_desc<4x8xf32, #layout0>, index, |
| /// index |
| /// } |
| /// %1 = unrealized_conversion_cast %r#0: !xegpu.tensor_desc<4x8xf32, |
| /// #layout0> -> !xegpu.tensor_desc<4x8xf32> |
| /// xegpu.prefetch_nd %1 [%r#1, %r#2] : !xegpu.tensor_desc<4x8xf32> |
| /// |
| /// ``` |
struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Only match when the prefetch is the last op before the terminator, so
    // sinking it cannot move it across barriers or other side effects.
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto prefetchOp = dyn_cast_or_null<xegpu::PrefetchNdOp>(lastNode);
    if (!prefetchOp)
      return failure();

    SmallVector<OpFoldResult> offsets = prefetchOp.getMixedOffsets();
    // PrefetchNdOp must have offsets.
    if (offsets.empty())
      return rewriter.notifyMatchFailure(prefetchOp,
                                         "the prefetch op must have offsets");
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, prefetchOp.getLoc(), offsets);
    SmallVector<Type> offsetTypes = llvm::map_to_vector(
        offsetsAsValues, [](Value v) { return v.getType(); });

    xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          prefetchOp, "the source tensor descriptor lacks layout attribute");

    // Yield order is fixed: [0] tensor desc, [1..] offsets. The operand
    // collection below relies on this layout of newRetIndices.
    SmallVector<Value> newYieldValues = {prefetchOp.getTensorDesc()};
    SmallVector<Type> newYieldTypes = {prefetchOp.getTensorDescType()};
    newYieldValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
    newYieldTypes.append(offsetTypes.begin(), offsetTypes.end());
    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
    // Create a new prefetch op outside the warp op with updated tensor
    // descriptor type. Source tensor descriptor require type resolution.
    xegpu::TensorDescType newTensorDescTy =
        prefetchOp.getTensorDescType().dropLayouts();
    rewriter.setInsertionPointAfter(newWarpOp);
    SmallVector<Value> newPrefetchOperands = {resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)};
    // Collect offsets. Offsets are uniform and need no type resolution.
    for (size_t i = 1; i < newRetIndices.size(); ++i)
      newPrefetchOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
    Operation *newPrefetchOp = xegpu::PrefetchNdOp::create(
        rewriter, newWarpOp.getLoc(), TypeRange{}, newPrefetchOperands,
        prefetchOp->getAttrs());
    xegpu::removeLayoutAttrs(newPrefetchOp);
    rewriter.eraseOp(prefetchOp);
    return success();
  }
};
| |
| /// Sink a gpu::BarrierOp at the end of enclosing `gpu.warp_execute_on_lane_0` |
| /// region. This will simply move the barrier op outside of the warp op. |
| struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern { |
| using gpu::WarpDistributionPattern::WarpDistributionPattern; |
| LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, |
| PatternRewriter &rewriter) const override { |
| gpu::YieldOp yield = warpOp.getTerminator(); |
| Operation *lastNode = yield->getPrevNode(); |
| // The last node must be a gpu::BarrierOp. |
| auto barrierOp = dyn_cast_or_null<gpu::BarrierOp>(lastNode); |
| if (!barrierOp) |
| return failure(); |
| // Move the barrier op outside of the warp op. |
| rewriter.setInsertionPointAfter(warpOp); |
| gpu::BarrierOp::create(rewriter, barrierOp.getLoc(), |
| barrierOp->getResultTypes(), |
| barrierOp->getOperands(), barrierOp->getAttrs()); |
| rewriter.eraseOp(barrierOp); |
| return success(); |
| } |
| }; |
| |
| /// Distribute a scattered store op. The offsets argument is required. |
| /// Both offset and mask vectors must be 1D and have #subgroup_size elements. |
| /// The layouts are fixed and implicit: one offset/mask per lane. |
| /// The pass changes the offset/mask vector shapes to a |
| /// single-element vector, **it is assumed that their producer will also be |
| /// distributed**. The payload vector also has a fixed distribution: |
| /// no chunk size -> vector of one element. |
| /// chunk size -> vector of the innermost dimension of the SG-payload. |
| /// Example 1 (no chunk size): |
| /// %mask = producer_op : vector<16xi1> |
| /// %offset = producer_op : vector<16xindex> |
| /// xegpu.store %payload, %src[%offset], %mask : vector<16xf16>, |
| /// memref<256xf16>, vector<16xindex>, vector<16xi1> |
| /// To |
| /// %mask = producer_op : vector<1xi1> |
| /// %offset = producer_op : vector<1xindex> |
| /// xegpu.store %payload, %src[%offset], %mask : vector<1xf16>, |
| /// memref<256xf16>, vector<1xindex>, vector<1xi1> |
| /// Example 2 (chunk size, same mask and offsets): |
| /// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> : |
| /// vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> |
| /// To |
| /// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> : |
| /// vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> |
| /// |
| /// Note that the store distribution pattern also handles leading unit |
| /// dimensions in the payload, mask and offsets vectors. In this case the store |
| /// distribution will only change the dimensions corresponding to the SG |
| /// distribution and keep the leading unit dimensions unchanged. |
/// For example, a store with payload vector<1x16xf16> with lane layout [1, 16]
/// will be distributed as vector<1x1xf16>. Shape-cast ops are inserted for the
/// offset/mask/payload when necessary so that the distributed store works
/// on a 1D vector to match the HW capability.
struct StoreDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Only match a store that immediately precedes the terminator, so the
    // store is never sunk past other ops in the region.
    Operation *lastNode = warpOp.getTerminator()->getPrevNode();
    auto storeScatterOp = dyn_cast_or_null<xegpu::StoreScatterOp>(lastNode);
    if (!storeScatterOp)
      return failure();
    auto offsets = storeScatterOp.getOffsets();
    if (!offsets || !isa<VectorType>(offsets.getType()))
      return rewriter.notifyMatchFailure(
          storeScatterOp, "Store op must have a vector of offsets argument");
    VectorType offsetsTy = cast<VectorType>(offsets.getType());
    VectorType maskTy = cast<VectorType>(storeScatterOp.getMask().getType());
    VectorType storeVecTy = cast<VectorType>(storeScatterOp.getValueType());

    // Without a chunk size the meaningful payload shape is 1D (one element
    // per lane); with a chunk size it is 2D (lanes x chunk). Any extra
    // leading dimensions are only accepted if they are unit dimensions.
    int chunkSize = storeScatterOp.getChunkSize().value_or(1);
    int effectiveVecRank = (chunkSize == 1) ? 1 : 2;

    // Check that all leading dimensions are unit dimensions.
    for (int i = 0; i < storeVecTy.getRank() - effectiveVecRank; i++) {
      if (storeVecTy.getShape()[i] != 1) {
        return rewriter.notifyMatchFailure(
            storeScatterOp, "Only unit dimensions allowed for the leading "
                            "dimensions of the store vector!");
      }
    }

    // Per-operand layouts: payload is operand 0, offsets operand 2 and the
    // mask operand 3. These drive the per-lane distributed vector types.
    auto layoutPayload =
        xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(0));
    auto layoutOffsets =
        xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(2));
    auto layoutMask =
        xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(3));

    FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
    FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
    FailureOr<VectorType> distMaskByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
    if (failed(distStoreVecByWarpOpOrFailure) ||
        failed(distOffsetsByWarpOpOrFailure) ||
        failed(distMaskByWarpOpOrFailure)) {
      return rewriter.notifyMatchFailure(
          storeScatterOp,
          "Some vector operands have no layouts, using defaults instead.");
    }

    VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value();
    VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
    VectorType distMaskTy = distMaskByWarpOpOrFailure.value();

    // Yield all store operands (payload, source, offsets, mask) from the
    // warp op with the distributed vector types.
    SmallVector<size_t> newRetIndices;
    SmallVector<Value> operands = storeScatterOp->getOperands();
    SmallVector<Type> operandTypesToYield = {
        distPayloadTy, operands[1].getType(), distOffsetsTy, distMaskTy};

    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypesToYield, newRetIndices);

    rewriter.setInsertionPointAfter(newWarpOp);

    // Distributed store payload type is always 1D without leading unit dims;
    // the same flattening is applied to the offsets and the mask.
    VectorType payloadTy1D = VectorType::get({distPayloadTy.getNumElements()},
                                             distPayloadTy.getElementType());

    VectorType distOffsetsTy1D = VectorType::get(
        {distOffsetsTy.getNumElements()}, distOffsetsTy.getElementType());
    VectorType distMaskTy1D = VectorType::get({distMaskTy.getNumElements()},
                                              distMaskTy.getElementType());

    // Resolve distributed types to 1D for SIMT execution.
    Value distPayloadVal = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[0]), payloadTy1D, rewriter);
    Value distOffsetVal = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[2]), distOffsetsTy1D, rewriter);
    Value distMaskVal = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[3]), distMaskTy1D, rewriter);

    SmallVector<Value> newStoreScatterOpOperands = {
        distPayloadVal, newWarpOp.getResult(newRetIndices[1]), distOffsetVal,
        distMaskVal};

    // Recreate the store outside the warp op with the distributed operands
    // and strip layout attributes that no longer apply after distribution.
    xegpu::StoreScatterOp newOp = xegpu::StoreScatterOp::create(
        rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
        storeScatterOp->getAttrs());
    xegpu::removeLayoutAttrs(newOp);
    rewriter.eraseOp(storeScatterOp);
    return success();
  }
};
| |
| static SmallVector<Value> computeDistributedCoordinatesForMatrixOp( |
| PatternRewriter &rewriter, Location loc, xegpu::DistributeLayoutAttr layout, |
| Value laneId, ArrayRef<int64_t> payloadShape, ValueRange origOffsets) { |
| SmallVector<Value> newCoods; |
| auto maybeCoords = |
| layout.computeDistributedCoords(rewriter, loc, laneId, payloadShape); |
| if (failed(maybeCoords)) |
| return {}; |
| assert(maybeCoords.value().size() == 1 && |
| "Expected one set of distributed offsets"); |
| SmallVector<OpFoldResult> ofrVec = xegpu::addWithRightAligned( |
| rewriter, loc, getAsOpFoldResult(maybeCoords.value()[0]), |
| getAsOpFoldResult(origOffsets)); |
| newCoods = llvm::map_to_vector(ofrVec, llvm::CastTo<Value>); |
| return newCoods; |
| } |
| |
| /// Pattern for distributing xegpu::LoadMatrixOp. |
struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Only match a load_matrix that immediately precedes the terminator.
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto matrixOp = dyn_cast_or_null<xegpu::LoadMatrixOp>(lastNode);
    if (!matrixOp)
      return failure();

    // The load's result must also be yielded by the warp op, and the yielded
    // value must come from this exact op.
    OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
      return isa<xegpu::LoadMatrixOp>(op) && matrixOp == op;
    });
    if (!producedByLastLoad)
      return rewriter.notifyMatchFailure(
          warpOp, "The last op is not xegpu::LoadMatrixOp");
    const int operandIdx = producedByLastLoad->getOperandNumber();

    VectorType sgPayloadTy =
        dyn_cast<VectorType>(matrixOp.getResult().getType());
    VectorType warpResultTy =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());
    if (!sgPayloadTy)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix op payload must be a vector type");

    auto loc = matrixOp.getLoc();
    auto offsets = matrixOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(matrixOp,
                                         "the load op must have offsets");
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);

    auto layout = matrixOp.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix operation lacks layout attribute");

    FailureOr<VectorType> distPayloadByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
    if (failed(distPayloadByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(
          matrixOp, "Failed to distribute matrix op payload based on layout.");

    // Yield the memory descriptor followed by all offset values from the
    // warp op so the load can be recreated outside of it.
    SmallVector<Value> operands = {matrixOp.getMemDesc()};
    const unsigned offsetsStartIdx = operands.size();
    operands.append(offsetsAsValues);

    SmallVector<Type> operandTypes =
        llvm::map_to_vector(operands, [](Value v) { return v.getType(); });

    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypes, newRetIndices);
    SmallVector<Value> newOperands = llvm::map_to_vector(
        newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });

    // All offsets are now carried as SSA values, so every constant offset
    // slot is marked dynamic.
    SmallVector<int64_t> newConstOffsets(matrixOp.getConstOffsets().size(),
                                         ShapedType::kDynamic);
    DenseI64ArrayAttr newConstOffsetsAttr =
        rewriter.getDenseI64ArrayAttr(newConstOffsets);
    ValueRange currentOffsets =
        ValueRange(newOperands).drop_front(offsetsStartIdx);

    SmallVector<Value> newCoords = currentOffsets;
    rewriter.setInsertionPointAfter(newWarpOp);

    // Without the subgroup_block_io attribute, each lane addresses its own
    // slice, so lane-local coordinates are added to the subgroup offsets.
    // NOTE(review): computeDistributedCoordinatesForMatrixOp returns an empty
    // vector on failure and that case is not checked here - confirm it cannot
    // fail once this point is reached (the IR was already mutated above).
    if (!matrixOp.getSubgroupBlockIoAttr()) {
      newCoords = computeDistributedCoordinatesForMatrixOp(
          rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
          currentOffsets);
    }
    // Recreate the load outside the warp op with the distributed payload
    // type and no layout attribute.
    xegpu::LoadMatrixOp newOp = xegpu::LoadMatrixOp::create(
        rewriter, newWarpOp.getLoc(), *distPayloadByWarpOpOrFailure,
        newOperands[0], ValueRange(newCoords), newConstOffsetsAttr,
        matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
    // Resolve the output type and replace all uses.
    rewriter.replaceAllUsesWith(
        newWarpOp.getResult(operandIdx),
        resolveDistributedTy(newOp.getResult(), warpResultTy, rewriter));
    return success();
  }
};
| |
| /// Pattern for distributing xegpu::StoreMatrixOp. |
struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Only match a store_matrix that immediately precedes the terminator.
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto matrixOp = dyn_cast_or_null<xegpu::StoreMatrixOp>(lastNode);
    if (!matrixOp)
      return failure();

    VectorType sgPayloadTy = dyn_cast<VectorType>(matrixOp.getData().getType());
    if (!sgPayloadTy)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix op payload must be a vector type");

    auto loc = matrixOp.getLoc();
    auto offsets = matrixOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(matrixOp,
                                         "the store op must have offsets");
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);

    auto layout = matrixOp.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix operation lacks layout attribute");

    FailureOr<VectorType> distPayloadByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
    if (failed(distPayloadByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(
          matrixOp, "Failed to distribute matrix op payload based on layout.");

    // Yield the payload, the memory descriptor and all offset values; the
    // payload is yielded with its distributed type.
    SmallVector<Value> operands = {matrixOp.getData(), matrixOp.getMemDesc()};
    const unsigned offsetsStartIdx = operands.size();
    operands.append(offsetsAsValues);

    SmallVector<Type> operandTypes =
        llvm::map_to_vector(operands, [](Value v) { return v.getType(); });
    operandTypes[0] = *distPayloadByWarpOpOrFailure;

    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypes, newRetIndices);
    SmallVector<Value> newOperands = llvm::map_to_vector(
        newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });

    // All offsets are now carried as SSA values, so every constant offset
    // slot is marked dynamic.
    SmallVector<int64_t> newConstOffsets(matrixOp.getConstOffsets().size(),
                                         ShapedType::kDynamic);
    DenseI64ArrayAttr newConstOffsetsAttr =
        rewriter.getDenseI64ArrayAttr(newConstOffsets);
    ValueRange currentOffsets =
        ValueRange(newOperands).drop_front(offsetsStartIdx);

    SmallVector<Value> newCoords = currentOffsets;
    rewriter.setInsertionPointAfter(newWarpOp);

    // Without the subgroup_block_io attribute, each lane addresses its own
    // slice, so lane-local coordinates are added to the subgroup offsets.
    // NOTE(review): computeDistributedCoordinatesForMatrixOp returns an empty
    // vector on failure and that case is not checked here - confirm it cannot
    // fail once this point is reached (the IR was already mutated above).
    if (!matrixOp.getSubgroupBlockIoAttr()) {
      newCoords = computeDistributedCoordinatesForMatrixOp(
          rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
          currentOffsets);
    }

    // Recreate the store outside the warp op with no layout attribute and
    // remove the original.
    xegpu::StoreMatrixOp::create(
        rewriter, loc, TypeRange{}, newOperands[0], newOperands[1],
        ValueRange(newCoords), newConstOffsetsAttr,
        matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
    rewriter.eraseOp(matrixOp);
    return success();
  }
};
| |
| /// Distribute a scattered load op. The logic and requirements are the same as |
| /// for the scattered store distribution. The warpOp's payload vector is |
| /// expected to be distributed by the load's result consumer. |
| /// Example 1 (no chunk size): |
| /// %mask = producer_op : vector<16xi1> |
| /// %offset = producer_op : vector<16xindex> |
| /// %0 = xegpu.load %payload, %src[%offset], %mask : memref<256xf16>, |
| /// vector<16xindex>, vector<16xi1> -> vector<16xf16> |
| /// To |
| /// %mask = producer_op : vector<1xi1> |
| /// %offset = producer_op : vector<1xindex> |
| /// %0 = xegpu.load %payload, %src[%offset], %mask : memref<256xf16>, |
| /// vector<1xindex>, vector<1xi1> -> vector<1xf16> |
| /// Example 2 (chunk size, same mask and offsets): |
| /// %0 = xegpu.load %payload, %src[%offset], %mask <{chunk_size=8}> : |
| /// memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> |
| /// To |
| /// %0 = xegpu.load %payload, %src[%offset], %mask <{chunk_size=8}> : |
| /// memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> |
| /// |
/// Note that the load distribution pattern also handles leading unit dimensions
/// in the payload, mask, and offsets vectors. The load distribution will only
/// change the dimensions corresponding to the SG distribution and keep the
/// leading unit dimensions unchanged. For example, a load with result type
/// vector<1x16xf16> with lane layout [1, 16] will be distributed
/// as result type vector<1x1xf16>. Shape-cast ops are inserted for the
/// offset/mask/payload when necessary so that the distributed load works
/// on a 1D vector to match the HW capability.
struct LoadDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
      // Check if the yield operand that was produced by the *last* scattered
      // load op to avoid sinking it before barriers (maintain memory order).
      return isa<xegpu::LoadGatherOp>(op) &&
             warpOp.getTerminator()->getPrevNode() == op;
    });
    if (!producedByLastLoad)
      return rewriter.notifyMatchFailure(
          warpOp, "The last op is not xegpu::LoadGatherOp");

    auto loadGatherOp =
        producedByLastLoad->get().getDefiningOp<xegpu::LoadGatherOp>();
    auto offsets = loadGatherOp.getOffsets();
    if (!offsets || !isa<VectorType>(offsets.getType()) ||
        !isa<VectorType>(loadGatherOp.getMask().getType()))
      return rewriter.notifyMatchFailure(
          loadGatherOp,
          "Load op must have a vector arguments for offsets and mask");
    VectorType offsetsTy = cast<VectorType>(offsets.getType());
    VectorType maskTy = cast<VectorType>(loadGatherOp.getMask().getType());
    VectorType resultVecTy =
        cast<VectorType>(loadGatherOp.getResult().getType());
    // Without a chunk size the meaningful result shape is 1D (one element
    // per lane); with a chunk size it is 2D (lanes x chunk). Any extra
    // leading dimensions are only accepted if they are unit dimensions.
    int chunkSize = loadGatherOp.getChunkSize().value_or(1);
    int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
    for (int i = 0; i < resultVecTy.getRank() - effectiveVecRank; i++) {
      if (resultVecTy.getShape()[i] != 1) {
        return rewriter.notifyMatchFailure(
            loadGatherOp, "Only unit dimensions allowed for the leading "
                          "dimensions of the load vector!");
      }
    }

    // Per-operand layouts: offsets is operand 1, the mask operand 2.
    auto layoutOffsets =
        xegpu::getTemporaryLayout(loadGatherOp->getOpOperand(1));
    auto layoutMask = xegpu::getTemporaryLayout(loadGatherOp->getOpOperand(2));

    FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
    FailureOr<VectorType> distMaskByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
    if (failed(distOffsetsByWarpOpOrFailure) ||
        failed(distMaskByWarpOpOrFailure)) {
      return rewriter.notifyMatchFailure(
          loadGatherOp,
          "Some vector operands have no layouts, using defaults instead.");
    }

    // Yield all load operands (source, offsets, mask) from the warp op with
    // the distributed vector types for offsets and mask.
    SmallVector<size_t> newRetIndices;
    SmallVector<Value> operands = loadGatherOp->getOperands();

    // The result type is taken from the warp op result (distributed by the
    // load's consumer, see the pattern documentation above).
    const unsigned operandIdx = producedByLastLoad->getOperandNumber();
    VectorType distResultTy =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());
    VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
    VectorType distMaskTy = distMaskByWarpOpOrFailure.value();

    SmallVector<Type> operandTypesToYield = {operands[0].getType(),
                                             distOffsetsTy, distMaskTy};

    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypesToYield, newRetIndices);

    rewriter.setInsertionPointAfter(newWarpOp);

    // Distributed load op will always be 1D.
    VectorType loadVecTy1D = VectorType::get({distResultTy.getNumElements()},
                                             distResultTy.getElementType());

    // Flatten the distributed offsets and mask types to 1D as well.
    VectorType distOffsetsTy1D =
        VectorType::get({distOffsetsByWarpOpOrFailure.value().getNumElements()},
                        distOffsetsByWarpOpOrFailure.value().getElementType());
    VectorType distMaskTy1D =
        VectorType::get({distMaskByWarpOpOrFailure.value().getNumElements()},
                        distMaskByWarpOpOrFailure.value().getElementType());

    // Resolve distributed types to 1D for SIMT execution.
    Value distOffsetVal = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[1]), distOffsetsTy1D, rewriter);
    Value distmaskVal = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[2]), distMaskTy1D, rewriter);

    SmallVector<Value> newLoadGatherOperands = {
        newWarpOp.getResult(newRetIndices[0]), distOffsetVal, distmaskVal};

    // Recreate the load outside the warp op and strip layout attributes
    // that no longer apply after distribution.
    xegpu::LoadGatherOp newOp = xegpu::LoadGatherOp::create(
        rewriter, newWarpOp.getLoc(), loadVecTy1D, newLoadGatherOperands,
        loadGatherOp->getAttrs());
    xegpu::removeLayoutAttrs(newOp);
    Value distributedVal = newWarpOp.getResult(operandIdx);
    // Resolve the output type and replace all uses.
    rewriter.replaceAllUsesWith(
        distributedVal,
        resolveDistributedTy(newOp.getResult(), distResultTy, rewriter));
    return success();
  }
};
| |
| // Sink SG-uniform ops. An op is uniform if none |
| // of its operands/results has a distribution layout attribute. |
| // Non-uniform vectors are handled by dedicated patterns. |
| // This pattern must have a higher priority than vector dialect distribution |
| // patterns, because a distributable shape may be logically intended as |
| // uniform (i.e., no layout), so we want to omit its distribution. |
struct SinkUniformOps final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Take the last op before the terminator as the sinking candidate.
    Operation *warpRegionPreYieldOp = warpOp.getTerminator()->getPrevNode();
    // Any ops with nested regions must be handled carefully in dedicated
    // patterns.
    if (!warpRegionPreYieldOp || warpRegionPreYieldOp->getNumRegions())
      return failure();
    int operandIdx = -1;
    if (warpRegionPreYieldOp->getNumResults()) {
      // If the op produces results, one of them must be yielded by the warp
      // op so uses outside the region can be rewired below.
      OpOperand *operand = getWarpResult(
          warpOp, [&](Operation *op) { return warpRegionPreYieldOp == op; });
      if (!operand)
        return failure();
      operandIdx = operand->getOperandNumber();
      // A uniform result keeps the same type across the warp boundary; a
      // type change indicates the value is distributed.
      if (warpRegionPreYieldOp->getResult(0).getType() !=
          warpOp.getResult(operandIdx).getType())
        return rewriter.notifyMatchFailure(warpOp,
                                           "The op result is not uniform.");
    }

    // The op must have no layout-based operands or results.
    bool uniformValuesOnly =
        llvm::all_of(warpRegionPreYieldOp->getResults(), [](Value v) {
          return !xegpu::getDistributeLayoutAttr(v);
        });
    uniformValuesOnly &=
        llvm::all_of(warpRegionPreYieldOp->getOpOperands(), [](OpOperand &opr) {
          return !xegpu::getDistributeLayoutAttr(opr);
        });
    if (!uniformValuesOnly)
      return rewriter.notifyMatchFailure(warpOp,
                                         "Some values are not uniform.");
    // Yield all of the op's operands from the warp op so the cloned op can
    // use them outside the region.
    SmallVector<size_t> newRetIndices;
    SmallVector<Value> operands =
        llvm::to_vector_of<Value>(warpRegionPreYieldOp->getOperands());
    SmallVector<Type> operandTypes =
        llvm::to_vector_of<Type>(warpRegionPreYieldOp->getOperandTypes());
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypes, newRetIndices);

    rewriter.setInsertionPointAfter(newWarpOp);
    // Map each original operand to the corresponding warp op result and
    // clone the op after the warp op using that mapping.
    IRMapping operandMapper;
    for (auto [oldOperandIdx, newOperandIdx] : llvm::enumerate(newRetIndices))
      operandMapper.map(warpRegionPreYieldOp->getOperand(oldOperandIdx),
                        newWarpOp->getResult(newOperandIdx));
    Operation *clonedOp = rewriter.clone(*warpRegionPreYieldOp, operandMapper);
    if (!clonedOp->getNumResults())
      // No results: the original op inside the region is simply removed.
      rewriter.eraseOp(warpRegionPreYieldOp);
    else {
      // Redirect outside uses of the warp op result to the cloned op.
      assert(operandIdx != -1 && "Expected a warp result for the operation");
      rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx),
                                  clonedOp->getResult(0));
    }
    return success();
  }
};
| |
| /// This patterns distribute the `vector.multi_reduction` operation across |
| /// lanes in a warp. Currently only 2D to 1D reductions are supported. Given |
| /// layouts for the source and accumulator vectors, |
| /// * If the reduction dimension is distributed across lanes, the reduction is |
| /// non-lane-local and the reduction is done using warp shuffles. Here we |
| /// simply rewrite the MultiDimReductionOp to a sequence of ReductionOps in |
| /// the warp op body. |
| /// * If the reduction dimension is not distributed across lanes, the reduction |
| /// is lane-local. In this case, we yield the source and accumulator vectors |
| /// from the warp op and perform the lane-local reduction outside the warp op |
| /// using a sequence of ReductionOps. |
| /// Example 1 (Reduction is lane-local): |
| /// ``` |
| /// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) { |
| /// %0 = "some_def"() : () -> (vector<16x32xf32>) |
| /// %acc = "some_def"() : () -> (vector<32xf32>) |
| /// %1 = vector.multi_reduction <add>, %0, %acc [0] : vector<16x32xf32> to |
| /// vector<32xf32> gpu.yield %1 : vector<32xf32> |
| /// } |
| /// ``` |
| /// is lowered to: |
| /// ``` |
| /// %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<16x1xf32>, |
| /// vector<1xf32>) { |
| /// %0 = "some_def"() : () -> (vector<16x32xf32>) |
| /// %acc = "some_def"() : () -> (vector<32xf32>) |
| /// gpu.yield %0, %acc : vector<16x32xf32>, vector<32xf32> |
| /// } |
| /// %c = arith.constant dense<0.0> : vector<1xf32> |
| /// %1 = vector.shape_cast %r#0 : vector<16x1xf32> to vector<16xf32> |
| /// %2 = vector.reduction <add>, %1, %r#1 : vector<16xf32> to f32 |
| /// %3 = vector.insert %2, %c[0] : f32 into vector<1xf32> |
| /// ``` |
| /// Example 2 (Reduction is non-lane-local): |
| /// ``` |
| /// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) { |
| /// %0 = "some_def"() : () -> (vector<2x32xf32>) |
| /// %acc = "some_def"() : () -> (vector<2xf32>) |
| /// %1 = vector.multi_reduction <add>, %0, %acc [1] : vector<2x32xf32> to |
| /// vector<2xf32> |
| /// gpu.yield %1 : vector<2xf32> |
| /// } |
| /// ``` |
| /// is lowered to: |
| /// ``` |
| /// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) { |
| /// %0 = "some_def"() : () -> (vector<2x32xf32>) |
| /// %acc = "some_def"() : () -> (vector<2xf32>) |
| /// %1 = arith.constant dense<0.0> : vector<2xf32> |
| /// %2 = vector.extract %0[0] : vector<32xf32> from <vector<2x32xf32>> |
| /// %3 = ("warp.reduction %2") : f32 |
| /// %4 = vector.insert %3, %1[0] : f32 into vector<2xf32> |
| /// ... repeat for row 1 |
| /// gpu.yield %1 : vector<2xf32> |
| /// } |
struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Find a yielded value produced by a vector.multi_reduction.
    OpOperand *yieldOperand =
        getWarpResult(warpOp, llvm::IsaPred<vector::MultiDimReductionOp>);
    if (!yieldOperand)
      return failure();
    auto reductionOp =
        cast<vector::MultiDimReductionOp>(yieldOperand->get().getDefiningOp());
    unsigned operandIdx = yieldOperand->getOperandNumber();
    VectorType sourceType = reductionOp.getSourceVectorType();
    // Only 2D vectors are supported.
    if (sourceType.getRank() != 2)
      return rewriter.notifyMatchFailure(warpOp,
                                         "Only 2D reductions are supported.");
    ArrayRef<int64_t> reductionDims = reductionOp.getReductionDims();
    // Only 1 reduction dimension supported. This also ensures that the result
    // is vector type.
    if (reductionDims.size() != 1)
      return rewriter.notifyMatchFailure(
          warpOp, "Only 1 reduction dimension is supported.");
    int64_t reductionDim = reductionDims[0];
    VectorType distributedResultType =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());
    VectorType resultType = cast<VectorType>(reductionOp.getType());
    xegpu::DistributeLayoutAttr sourceLayout =
        xegpu::getTemporaryLayout(reductionOp->getOpOperand(0));

    FailureOr<VectorType> sourceDistTypeOrFailure =
        getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
    if (failed(sourceDistTypeOrFailure))
      return rewriter.notifyMatchFailure(
          warpOp, "Failed to distribute the source vector type.");
    VectorType sourceDistType = sourceDistTypeOrFailure.value();
    // Only single dimension distribution is supported. A dimension counts as
    // distributed when its distributed extent differs from the SG extent.
    bool dim0Distributed =
        sourceDistType.getShape()[0] != sourceType.getShape()[0];
    bool dim1Distributed =
        sourceDistType.getShape()[1] != sourceType.getShape()[1];
    if (dim0Distributed && dim1Distributed)
      return rewriter.notifyMatchFailure(
          warpOp, "Expecting source to be distributed in a single dimension.");
    int64_t sourceDistDim = dim0Distributed ? 0 : (dim1Distributed ? 1 : -1);
    if (sourceDistDim == -1)
      return rewriter.notifyMatchFailure(
          warpOp, "Expecting a distributed source vector.");
    bool resultDistributed =
        distributedResultType.getNumElements() < resultType.getNumElements();
    // If the lane owns all the data required for reduction (i.e. reduction is
    // fully parallel across lanes), then each lane owns part of the result
    // (i.e. result is distributed). If the reduction requires cross-lane
    // shuffling, then the result is shared among all lanes (broadcasted).
    // Therefore we expect following cases:
    //
    // | Source vector        | Reduction dim  | Result vector  |
    // |----------------------|----------------|----------------|
    // | dim-0 distributed    | 0              | broadcasted    |
    // | dim-0 distributed    | 1              | distributed    |
    // | dim-1 distributed    | 0              | distributed    |
    // | dim-1 distributed    | 1              | broadcasted    |

    bool isReductionLaneLocal = (sourceDistDim == 0 && reductionDim == 1) ||
                                (sourceDistDim == 1 && reductionDim == 0);
    if (isReductionLaneLocal && !resultDistributed)
      return rewriter.notifyMatchFailure(
          warpOp, "Expecting a distributed result for lane-local reduction.");

    if (!isReductionLaneLocal && resultDistributed)
      return rewriter.notifyMatchFailure(
          warpOp,
          "Expecting a broadcasted result for non-lane-local reduction.");

    // Handle lane-local reduction case. In this case we fully distribute the
    // reduction result.
    if (isReductionLaneLocal) {
      // Yield the source and acc vectors from the WarpOp.
      SmallVector<size_t> newRetIndices;
      auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
          rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
          {sourceDistType, distributedResultType}, newRetIndices);
      rewriter.setInsertionPointAfter(newWarpOp);
      // Lower to per-row vector.reduction ops outside the warp op.
      Value result = xegpu::lowerToVectorReductions(
          cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[0])),
          cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
          reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
      // Replace the warp op result with the final result.
      rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), result);
      return success();
    }
    // For non-lane-local case, we simply rewrite the MultiReductionOp in terms
    // of multiple ReductionOps. Actual distribution is done by the
    // WarpOpReduction pattern.
    rewriter.setInsertionPointAfter(reductionOp);
    Value result = xegpu::lowerToVectorReductions(
        cast<TypedValue<VectorType>>(reductionOp.getSource()),
        cast<TypedValue<VectorType>>(reductionOp.getAcc()),
        reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
    // Replace the warp op result with the final result.
    rewriter.replaceAllUsesWith(reductionOp.getResult(), result);
    return success();
  }
};
| |
| /// This pattern distributes the `vector.broadcast` operation across lanes in a |
| /// warp. The pattern supports three use cases: |
| /// |
/// 1) Broadcast a low-rank vector to a high-rank vector: The low-rank input
///    vector must have a layout that is a slice of the result layout. If the
///    distributed source and target vector types are identical, this lowers to
///    a no-op; otherwise, it remains a broadcast but operates on distributed
///    vectors.
| /// |
| /// 2) Broadcast a same-rank vector with identical layouts for source and |
| /// target: |
| /// The source vector must have unit dimensions, and lane_data must be unit |
| /// size for those unit dims. This always lowers to a no-op. |
| /// |
| /// 3) Broadcast a scalar with no layout: This always lowers to a broadcast from |
| /// scalar to distributed result type. |
| /// |
| /// Example 1 (lowering to a broadcast with distributed types): |
| /// ``` |
| /// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) { |
| /// %0 = "some_def"() {layout_result_0 = |
| /// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>, |
| /// dims = [0]> } : () -> (vector<32xf32>) |
| /// %2 = vector.broadcast %0 {layout_result_0 = |
| /// #xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>} |
| /// : vector<32xf32> to vector<8x32xf32> |
///   gpu.yield %2 : vector<8x32xf32>
| /// } |
| /// ``` |
| /// is lowered to: |
| /// ``` |
| /// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) { |
| /// %0 = "some_def"() {layout_result_0 = |
| /// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>, |
| /// dims = [0]> } : () -> (vector<32xf32>) |
| /// gpu.yield %0 : vector<32xf32> |
| /// } |
/// %2 = vector.broadcast %r#0 : vector<1xf32> to vector<8x1xf32>
/// ```
///
| /// Example 2 (no-op): |
| /// ``` |
| /// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x32xf32>) { |
| /// %0 = "some_def"() {layout_result_0 = |
| /// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>, |
| /// dims = [1]> } : () -> (vector<8xf32>) |
| /// %1 = vector.shape_cast %0 |
| /// {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1, |
| /// 1]>}: vector<8xf32> to vector<8x1xf32> |
| /// %2 = vector.broadcast %1 |
| /// {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1, |
| /// 1]>}: vector<8x1xf32> to vector<8x32xf32> |
///   gpu.yield %2 : vector<8x32xf32>
| /// } |
| /// ``` |
| /// is lowered to: |
| /// ``` |
| /// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) { |
| /// %0 = "some_def"() {layout_result_0 = |
| /// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>, |
| /// dims = [1]> } : () -> (vector<8xf32>) |
| /// %1 = vector.shape_cast %0 |
| /// {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1, |
| /// 1]>}: vector<8xf32> to vector<8x1xf32> |
| /// gpu.yield %1 : vector<8x1xf32> |
| /// } |
| /// // The broadcast is implicit through layout transformation (no-op) |
| /// "some_use"(%r#0) |
| /// ``` |
| struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern { |
| using gpu::WarpDistributionPattern::WarpDistributionPattern; |
| LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, |
| PatternRewriter &rewriter) const override { |
| OpOperand *yieldOperand = |
| getWarpResult(warpOp, llvm::IsaPred<vector::BroadcastOp>); |
| if (!yieldOperand) |
| return failure(); |
| auto broadcastOp = |
| cast<vector::BroadcastOp>(yieldOperand->get().getDefiningOp()); |
| unsigned operandIdx = yieldOperand->getOperandNumber(); |
| |
| VectorType sourceType = dyn_cast<VectorType>(broadcastOp.getSourceType()); |
| VectorType destType = |
| dyn_cast<VectorType>(broadcastOp.getResult().getType()); |
| |
| xegpu::DistributeLayoutAttr sourceLayout = |
| xegpu::getTemporaryLayout(broadcastOp->getOpOperand(0)); |
| xegpu::DistributeLayoutAttr resultLayout = |
| xegpu::getTemporaryLayout(dyn_cast<OpResult>(broadcastOp.getResult())); |
| |
| FailureOr<VectorType> sourceDistType; |
| Type sourceElemOrDistType; |
| if (sourceType) { |
| |
| // Case 1 and 2: source is a vector type. |
| int64_t rankDiff = destType.getRank() - sourceType.getRank(); |
| if (rankDiff > 0) { |
| // Case 1: source is lower-rank than result. |
| bool isSliceOf = sourceLayout.isSliceOf(resultLayout); |
| if (!isSliceOf) |
| broadcastOp.emitWarning() |
| << "Broadcast input layout must be a slice of result layout."; |
| } |
| // case 2: source and result have same rank |
| if (rankDiff == 0) { |
| auto broadcastUnitDimsSet = broadcastOp.computeBroadcastedUnitDims(); |
| SmallVector<int64_t> broadcastUnitDims(broadcastUnitDimsSet.begin(), |
| broadcastUnitDimsSet.end()); |
| bool isEqualTo = sourceLayout.isEqualTo(resultLayout); |
| if (!isEqualTo) |
| return rewriter.notifyMatchFailure( |
| warpOp, "For same-rank broadcast, source must be identical to " |
| "adjusted result layouts with unit dims."); |
| resultLayout = resultLayout.setUnitDimData(broadcastUnitDims); |
| sourceLayout = sourceLayout.setUnitDimLayout(broadcastUnitDims); |
| } |
| |
| sourceDistType = |
| getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType); |
| if (failed(sourceDistType)) { |
| return rewriter.notifyMatchFailure( |
| warpOp, "Failed to distribute the source vector type."); |
| } |
| sourceElemOrDistType = sourceDistType.value(); |
| |
| } else { |
| // Case 3: source is a scalar type. |
| if (sourceLayout) { |
| return rewriter.notifyMatchFailure( |
| warpOp, "Broadcast from scalar must not have a layout attribute."); |
| } |
| sourceElemOrDistType = broadcastOp.getSourceType(); |
| } |
| FailureOr<VectorType> destDistType = |
| getDistVecTypeBasedOnLaneLayout(resultLayout, destType); |
| if (failed(destDistType)) { |
| return rewriter.notifyMatchFailure( |
| warpOp, "Failed to distribute the dest vector type."); |
| } |
| |
| SmallVector<size_t> newRetIndices; |
| auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns( |
| rewriter, warpOp, {broadcastOp.getSource()}, sourceElemOrDistType, |
| newRetIndices); |
| |
| Value distributedSource = newWarpOp.getResult(newRetIndices[0]); |
| |
| Value newBroadcast = distributedSource; |
| |
| if (sourceElemOrDistType != destDistType.value()) { |
| rewriter.setInsertionPointAfter(newWarpOp); |
| newBroadcast = |
| vector::BroadcastOp::create(rewriter, newWarpOp.getLoc(), |
| destDistType.value(), distributedSource); |
| } |
| |
| rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newBroadcast); |
| return success(); |
| } |
| }; |
| |
| /// Distribute a `vector.shape_cast` op feeding into yield op of an enclosing |
| /// `gpu.warp_execute_on_lane_0` region. |
| struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern { |
| using gpu::WarpDistributionPattern::WarpDistributionPattern; |
| LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, |
| PatternRewriter &rewriter) const override { |
| OpOperand *yieldOperand = |
| getWarpResult(warpOp, llvm::IsaPred<vector::ShapeCastOp>); |
| if (!yieldOperand) |
| return failure(); |
| auto shapeCastOp = |
| cast<vector::ShapeCastOp>(yieldOperand->get().getDefiningOp()); |
| unsigned operandNumber = yieldOperand->getOperandNumber(); |
| auto resultDistTy = |
| cast<VectorType>(warpOp.getResult(operandNumber).getType()); |
| xegpu::DistributeLayoutAttr sourceLayout = |
| xegpu::getTemporaryLayout(shapeCastOp->getOpOperand(0)); |
| xegpu::DistributeLayoutAttr resultLayout = |
| xegpu::getTemporaryLayout(dyn_cast<OpResult>(shapeCastOp.getResult())); |
| if (!sourceLayout || !resultLayout) |
| return rewriter.notifyMatchFailure( |
| warpOp, |
| "the source or result of shape_cast op lacks distribution layout"); |
| |
| FailureOr<VectorType> sourceDistTypeOrFailure = |
| getDistVecTypeBasedOnLaneLayout(sourceLayout, |
| shapeCastOp.getSourceVectorType()); |
| if (failed(sourceDistTypeOrFailure)) |
| return rewriter.notifyMatchFailure( |
| warpOp, "failed to get distributed vector type for source"); |
| VectorType sourceDistType = sourceDistTypeOrFailure.value(); |
| // Create a new warp op that yields the source of the shape_cast op. |
| SmallVector<size_t> newRetIndices; |
| auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns( |
| rewriter, warpOp, {shapeCastOp.getSource()}, {sourceDistType}, |
| newRetIndices); |
| rewriter.setInsertionPointAfter(newWarpOp); |
| Value source = newWarpOp.getResult(newRetIndices[0]); |
| // Create a new shape_cast op outside the warp op. |
| Value newShapeCast = vector::ShapeCastOp::create( |
| rewriter, shapeCastOp.getLoc(), resultDistTy, source); |
| rewriter.replaceAllUsesWith(newWarpOp.getResult(operandNumber), |
| newShapeCast); |
| return success(); |
| } |
| }; |
| |
| // Distribute a `vector.extract_strided_slice` op feeding into yield op of an |
| // enclosing `gpu.warp_execute_on_lane_0` region. This pattern covers |
| // advanced cases where the distributed dimension is partially extracted and |
| // currently not supported by the generic vector distribution patterns. |
struct VectorExtractStridedSliceDistribution
    : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  /// Sink a `vector.extract_strided_slice` feeding the warp op yield out of
  /// the warp region. If the extracted dimension is distributed across lanes,
  /// the sizes/offsets are rescaled to lane-local coordinates first.
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand =
        getWarpResult(warpOp, llvm::IsaPred<vector::ExtractStridedSliceOp>);
    if (!operand)
      return failure();
    auto extractOp =
        cast<vector::ExtractStridedSliceOp>(operand->get().getDefiningOp());
    unsigned operandIdx = operand->getOperandNumber();
    auto distributedType =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());
    // Find the distributed dimensions by comparing the SIMD-level result type
    // with the (already decided) distributed warp op result type.
    auto extractResultType = cast<VectorType>(operand->get().getType());
    auto distributedDims =
        getDistributedDims(extractResultType, distributedType);
    // Collect updated source type, sizes and offsets. They may be adjusted
    // later if the data is distributed to lanes (as opposed to being owned by
    // all lanes uniformly).
    VectorType updatedSourceType = extractOp.getSourceVectorType();
    SmallVector<Attribute> updatedSizes = llvm::map_to_vector(
        extractOp.getSizes(), [](Attribute attr) { return attr; });
    SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
        extractOp.getOffsets(), [](Attribute attr) { return attr; });
    SmallVector<Attribute> updatedStrides = llvm::map_to_vector(
        extractOp.getStrides(), [](Attribute attr) { return attr; });
    // If the provided sizes, offsets, strides are less than the rank, pad them
    // with full sizes, zero offsets, and unit strides. This makes it easier to
    // adjust them later.
    int64_t sourceRank = extractOp.getSourceVectorType().getRank();
    for (int64_t i = extractOp.getSizes().size(); i < sourceRank; ++i) {
      updatedSizes.push_back(rewriter.getI64IntegerAttr(
          extractOp.getSourceVectorType().getDimSize(i)));
      updatedOffsets.push_back(rewriter.getI64IntegerAttr(0));
      updatedStrides.push_back(
          rewriter.getI64IntegerAttr(1)); // stride is always 1.
    }
    // If the result is distributed, it must be distributed in exactly one
    // dimension. In this case, we adjust the sourceDistType, distributedSizes
    // and distributedOffsets accordingly.
    if (distributedDims.size() > 0) {
      if (distributedDims.size() != 1)
        return rewriter.notifyMatchFailure(
            warpOp, "Source can not be distributed in multiple dimensions.");
      int64_t distributedDim = distributedDims[0];
      int sourceDistrDimSize =
          extractOp.getSourceVectorType().getShape()[distributedDim];
      // The source layout drives how the slice coordinates are rescaled.
      auto sourceLayout = xegpu::getTemporaryLayout(extractOp->getOpOperand(0));
      if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty())
        return rewriter.notifyMatchFailure(
            warpOp, "the source of extract_strided_slice op lacks distribution "
                    "layout");
      auto sourceLaneLayout = sourceLayout.getEffectiveLaneLayoutAsInt();
      // Because only single dimension distribution is supported, lane layout
      // size at the distributed dim must be the subgroup size.
      int subgroupSize = sourceLaneLayout[distributedDim];
      // Check if the source size in the distributed dimension is a multiple of
      // subgroup size.
      if (sourceDistrDimSize % subgroupSize != 0)
        return rewriter.notifyMatchFailure(
            warpOp,
            "Source size along distributed dimension is not a multiple of "
            "subgroup size.");
      auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
      // We expect lane data to be all ones in this case. This guarantees a
      // uniform round-robin distribution of elements to lanes.
      if (!llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
        return rewriter.notifyMatchFailure(
            warpOp, "Expecting unit lane data in source layout");
      // The offsets in the distributed dimension must be a multiple of
      // subgroup size; otherwise the slice start is not lane-aligned.
      int64_t distrDimOffset =
          cast<IntegerAttr>(updatedOffsets[distributedDim]).getInt();
      if (distrDimOffset % subgroupSize != 0)
        return rewriter.notifyMatchFailure(
            warpOp, "Offset along distributed dimension "
                    "is not a multiple of subgroup size.");
      // Source type after distribution (checked valid above, hence .value()).
      updatedSourceType = getDistVecTypeBasedOnLaneLayout(
                              sourceLayout, extractOp.getSourceVectorType())
                              .value();
      // Update the distributed sizes to match the distributed type.
      updatedSizes[distributedDim] = rewriter.getI64IntegerAttr(
          distributedType.getDimSize(distributedDim));
      // Update the distributed offsets to match round robin distribution (i.e.
      // each lane owns data at `subgroupSize` stride given unit lane data).
      updatedOffsets[distributedDim] =
          rewriter.getI64IntegerAttr(distrDimOffset / subgroupSize);
    }
    // Do the distribution by yielding the source of the extract op from
    // the warp op and creating a new extract op outside the warp op.
    SmallVector<size_t> newRetIndices;
    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, {extractOp.getSource()}, {updatedSourceType},
        newRetIndices);
    rewriter.setInsertionPointAfter(newWarpOp);
    Value source = newWarpOp.getResult(newRetIndices[0]);
    // Create a new extract op outside the warp op, using the (possibly
    // rescaled) offsets/sizes/strides on the distributed source.
    Value newExtractOp = vector::ExtractStridedSliceOp::create(
        rewriter, extractOp.getLoc(), distributedType, source,
        ArrayAttr::get(rewriter.getContext(), updatedOffsets),
        ArrayAttr::get(rewriter.getContext(), updatedSizes),
        ArrayAttr::get(rewriter.getContext(), updatedStrides));
    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newExtractOp);
    return success();
  }
};
| |
| /// Distribute a `vector.insert_strided_slice` op feeding into yield op of an |
| /// enclosing `gpu.warp_execute_on_lane_0` region. This pattern covers |
| /// advanced cases where the distributed dimension is partially inserted and |
| /// currently not supported by the generic vector distribution patterns. |
struct VectorInsertStridedSliceDistribution
    : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  /// Sink a `vector.insert_strided_slice` feeding the warp op yield out of the
  /// warp region. If the inserted dimension is distributed across lanes, both
  /// source and dest types plus the insertion offsets are rescaled to
  /// lane-local coordinates first.
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) {
      // Check if the InsertStridedSliceOp is the last op before yield op
      return llvm::IsaPred<vector::InsertStridedSliceOp>(op) &&
             warpOp.getTerminator()->getPrevNode() == op;
    });
    if (!operand)
      return failure();
    unsigned int operandNumber = operand->getOperandNumber();
    auto insertOp =
        operand->get().getDefiningOp<vector::InsertStridedSliceOp>();
    auto distributedType =
        cast<VectorType>(warpOp.getResult(operandNumber).getType());
    // Find the distributed dimensions of the dest vector.
    auto insertResultType = cast<VectorType>(operand->get().getType());
    auto destDistributedDims =
        getDistributedDims(insertResultType, distributedType);
    // Collect updated offsets, source type and dest type. They may be adjusted
    // later if the data is distributed to lanes (as opposed to being owned by
    // all lanes uniformly).
    SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
        insertOp.getOffsets(), [](Attribute attr) { return attr; });
    VectorType updatedSourceType = insertOp.getSourceVectorType();
    VectorType updatedDestType = insertOp.getDestVectorType();
    if (destDistributedDims.size() > 0) {
      // Only single dimension distribution is supported.
      if (destDistributedDims.size() != 1)
        return rewriter.notifyMatchFailure(
            warpOp,
            "Expecting source to be distributed in a single dimension.");
      int64_t destDistributedDim = destDistributedDims[0];

      VectorType srcType = insertOp.getSourceVectorType();
      VectorType destType = insertOp.getDestVectorType();
      // Currently we require that both source (kD) and dest (nD) vectors are
      // distributed. This requires that distributedDim (d) is contained in the
      // last k dims of the dest vector (d >= n - k).
      int64_t sourceDistributedDim =
          destDistributedDim - (destType.getRank() - srcType.getRank());
      if (sourceDistributedDim < 0)
        return rewriter.notifyMatchFailure(
            insertOp,
            "distributed dimension must be in the last k (i.e. source "
            "rank) dims of dest vector");
      int64_t srcDistrDimSize = srcType.getDimSize(sourceDistributedDim);
      // Obtain the source and dest layouts. Operand 0 is the value to store,
      // operand 1 is the destination vector.
      auto destLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(1));
      auto sourceLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(0));
      if (!destLayout || !sourceLayout ||
          destLayout.getEffectiveLaneLayoutAsInt().empty() ||
          sourceLayout.getEffectiveLaneLayoutAsInt().empty())
        return rewriter.notifyMatchFailure(
            warpOp, "the source or dest of insert_strided_slice op lacks "
                    "distribution layout");
      // Because only single dimension distribution is supported, lane layout
      // size at the distributed dim must be the subgroup size.
      int subgroupSize =
          destLayout.getEffectiveLaneLayoutAsInt()[destDistributedDim];
      // We require that source and dest lane data are all ones to ensure
      // uniform round robin distribution.
      auto destLaneData = destLayout.getEffectiveLaneDataAsInt();
      auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
      if (!llvm::all_of(destLaneData, [](int64_t v) { return v == 1; }) ||
          !llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
        return rewriter.notifyMatchFailure(
            warpOp, "Expecting unit lane data in source and dest layouts");
      // Source distributed dim size must be multiples of subgroup size.
      if (srcDistrDimSize % subgroupSize != 0)
        return rewriter.notifyMatchFailure(
            warpOp, "Distributed dimension size in source is not a multiple of "
                    "subgroup size.");
      // Offsets in the distributed dimension must be multiples of subgroup
      // size; otherwise the insertion start is not lane-aligned.
      int64_t destDistrDimOffset =
          cast<IntegerAttr>(insertOp.getOffsets()[destDistributedDim]).getInt();
      if (destDistrDimOffset % subgroupSize != 0)
        return rewriter.notifyMatchFailure(
            warpOp,
            "Offset along distributed dimension in dest is not a multiple of "
            "subgroup size.");
      // Update the source and dest types based on their layouts (validity
      // checked above, hence .value()).
      updatedSourceType = getDistVecTypeBasedOnLaneLayout(
                              sourceLayout, insertOp.getSourceVectorType())
                              .value();
      updatedDestType = getDistVecTypeBasedOnLaneLayout(
                            destLayout, insertOp.getDestVectorType())
                            .value();
      // Update the distributed offsets to match round robin distribution (i.e.
      // each lane owns data at `subgroupSize` stride given unit lane data).
      updatedOffsets[destDistributedDim] =
          rewriter.getI64IntegerAttr(destDistrDimOffset / subgroupSize);
    }
    // Do the distribution by yielding the source and dest of the insert op
    // from the warp op and creating a new insert op outside the warp op.
    SmallVector<size_t> newRetIndices;
    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, {insertOp.getValueToStore(), insertOp.getDest()},
        {updatedSourceType, updatedDestType}, newRetIndices);
    rewriter.setInsertionPointAfter(newWarpOp);

    Value valueToStore = newWarpOp.getResult(newRetIndices[0]);
    Value dest = newWarpOp.getResult(newRetIndices[1]);
    // Create a new insert op outside the warp op. Strides are unchanged.
    Value newInsertOp = vector::InsertStridedSliceOp::create(
        rewriter, insertOp.getLoc(), updatedDestType, valueToStore, dest,
        ArrayAttr::get(rewriter.getContext(), updatedOffsets),
        insertOp.getStrides());
    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandNumber),
                                newInsertOp);
    return success();
  }
};
| |
| /// Sink a memref::ExtractAlignedPointerAsIndex op feeding into yield op of an |
| /// enclosing `gpu.warp_execute_on_lane_0` region. This will simply move the op |
| /// outside of the warp op. |
| struct MemrefExtractAlignedPointerAsIndexDistribution final |
| : public gpu::WarpDistributionPattern { |
| using gpu::WarpDistributionPattern::WarpDistributionPattern; |
| LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, |
| PatternRewriter &rewriter) const override { |
| OpOperand *operand = getWarpResult( |
| warpOp, llvm::IsaPred<memref::ExtractAlignedPointerAsIndexOp>); |
| if (!operand) |
| return rewriter.notifyMatchFailure( |
| warpOp, |
| "warp result is not a memref::MemrefExtractAlignedPointerAsIndex op"); |
| auto extractOp = |
| operand->get().getDefiningOp<memref::ExtractAlignedPointerAsIndexOp>(); |
| unsigned operandIdx = operand->getOperandNumber(); |
| SmallVector<size_t> newRetIndices; |
| gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( |
| rewriter, warpOp, extractOp.getSource(), |
| TypeRange{extractOp.getSource().getType()}, newRetIndices); |
| rewriter.setInsertionPointAfter(newWarpOp); |
| auto newExtractOp = memref::ExtractAlignedPointerAsIndexOp::create( |
| rewriter, newWarpOp.getLoc(), extractOp.getType(), |
| newWarpOp.getResult(newRetIndices[0])); |
| Value resultVal = newWarpOp.getResult(operandIdx); |
| rewriter.replaceAllUsesWith(resultVal, newExtractOp.getResult()); |
| return success(); |
| } |
| }; |
| |
| /// Distribute a vector::BitCastOp feeding into yield op of an enclosing |
| /// `gpu.warp_execute_on_lane_0` region. Bitcast only impacts the innermost |
/// dimension of the source/result vectors. An equivalent vector::BitCastOp is
| /// created outside of the warp op with distributed source vector type (computed |
| /// using assigned layout). |
| struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern { |
| using gpu::WarpDistributionPattern::WarpDistributionPattern; |
| LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, |
| PatternRewriter &rewriter) const override { |
| OpOperand *operand = |
| getWarpResult(warpOp, llvm::IsaPred<vector::BitCastOp>); |
| if (!operand) |
| return rewriter.notifyMatchFailure( |
| warpOp, "warp result is not a vector::BitCast op"); |
| auto bitcastOp = operand->get().getDefiningOp<vector::BitCastOp>(); |
| unsigned operandIdx = operand->getOperandNumber(); |
| VectorType distributedSourceType = |
| getDistVecTypeBasedOnLaneLayout( |
| xegpu::getTemporaryLayout(bitcastOp->getOpOperand(0)), |
| bitcastOp.getSourceVectorType()) |
| .value_or(VectorType()); |
| if (!distributedSourceType) |
| return rewriter.notifyMatchFailure( |
| bitcastOp, "Failed to distribute the source vector type in " |
| "vector::BitCast op"); |
| VectorType distributedResultType = |
| cast<VectorType>(warpOp.getResult(operandIdx).getType()); |
| SmallVector<size_t> newRetIndices; |
| gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( |
| rewriter, warpOp, bitcastOp.getSource(), |
| TypeRange{distributedSourceType}, newRetIndices); |
| rewriter.setInsertionPointAfter(newWarpOp); |
| auto newBitcastOp = vector::BitCastOp::create( |
| rewriter, newWarpOp.getLoc(), distributedResultType, |
| newWarpOp.getResult(newRetIndices[0])); |
| Value distributedVal = newWarpOp.getResult(operandIdx); |
| rewriter.replaceAllUsesWith(distributedVal, newBitcastOp.getResult()); |
| return success(); |
| } |
| }; |
| |
| /// Distribute a vector::TransposeOp feeding into yield op of an enclosing |
| /// `gpu.warp_execute_on_lane_0` region. Currently only 2D transposes are |
| /// supported. In most cases, transpose is a no op because it is entirely |
| /// handled using the layouts (e.g. 16x1 -> 1x16). However, if each lane owns |
| /// multiple slices of data after distribution (e.g. 16x2 -> 2x16), a lane-local |
| /// transpose (i.e. shuffle) is needed. Therefore, we create an equivalent |
| /// vector::TransposeOp outside of the warp op with distributed source vector |
| /// type (computed using assigned layout). |
| struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern { |
| using gpu::WarpDistributionPattern::WarpDistributionPattern; |
| LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, |
| PatternRewriter &rewriter) const override { |
| OpOperand *operand = |
| getWarpResult(warpOp, llvm::IsaPred<vector::TransposeOp>); |
| if (!operand) |
| return rewriter.notifyMatchFailure( |
| warpOp, "warp result is not a vector::Transpose op"); |
| auto transposeOp = operand->get().getDefiningOp<vector::TransposeOp>(); |
| unsigned operandIdx = operand->getOperandNumber(); |
| xegpu::DistributeLayoutAttr sourceLayout = |
| xegpu::getTemporaryLayout(transposeOp->getOpOperand(0)); |
| xegpu::DistributeLayoutAttr resultLayout = |
| xegpu::getTemporaryLayout(transposeOp->getOpResult(0)); |
| if (!sourceLayout || !resultLayout) |
| return rewriter.notifyMatchFailure( |
| transposeOp, |
| "the source or result vector of the transpose op lacks layout " |
| "attribute"); |
| int64_t sourceRank = transposeOp.getSourceVectorType().getRank(); |
| int64_t resultRank = transposeOp.getResultVectorType().getRank(); |
| // Only 2D transposes are supported for now. |
| // TODO: Support nD transposes. |
| if (sourceRank != 2 || resultRank != 2) |
| return rewriter.notifyMatchFailure( |
| transposeOp, "the source or result vector of the transpose op " |
| "does not have 2D layout"); |
| ArrayRef<int64_t> perm = transposeOp.getPermutation(); |
| // Result layout must be a transpose of source layout. |
| if (!resultLayout.isTransposeOf(sourceLayout, perm, |
| xegpu::LayoutKind::Lane)) |
| return rewriter.notifyMatchFailure( |
| transposeOp, |
| "the source or result vector layouts must be 2D transposes of each " |
| "other"); |
| FailureOr<VectorType> distributedSourceTypeOrFailure = |
| getDistVecTypeBasedOnLaneLayout(sourceLayout, |
| transposeOp.getSourceVectorType()); |
| if (failed(distributedSourceTypeOrFailure)) |
| return rewriter.notifyMatchFailure( |
| transposeOp, "Failed to distribute the source vector type in " |
| "vector::Transpose op"); |
| SmallVector<size_t> newRetIndices; |
| gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( |
| rewriter, warpOp, transposeOp.getVector(), |
| TypeRange{distributedSourceTypeOrFailure.value()}, newRetIndices); |
| rewriter.setInsertionPointAfter(newWarpOp); |
| auto newTransposeOp = vector::TransposeOp::create( |
| rewriter, newWarpOp.getLoc(), newWarpOp.getResult(newRetIndices[0]), |
| perm); |
| Value distributedVal = newWarpOp.getResult(operandIdx); |
| rewriter.replaceAllUsesWith(distributedVal, newTransposeOp.getResult()); |
| return success(); |
| } |
| }; |
| |
| /// Distribute a vector::StepOp with the sliced result layout. |
| /// The sliced layout must have exactly 1 effective lane dimension. |
| /// We completely resolve the vector::StepOp by computing the lane_data-sized |
| /// subranges. |
struct VectorStepSliceDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  /// Fully resolve a `vector.step` whose result carries a slice layout with a
  /// single effective lane dimension: each lane's values are reconstructed
  /// from its lane id and assembled with `vector.from_elements`.
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<vector::StepOp>);
    if (!operand)
      return rewriter.notifyMatchFailure(
          warpOp, "warp result is not a vector::StepOp op");
    auto stepOp = operand->get().getDefiningOp<vector::StepOp>();
    unsigned operandIdx = operand->getOperandNumber();
    xegpu::DistributeLayoutAttr resultLayout =
        xegpu::getTemporaryLayout(stepOp->getResult(0));
    if (!resultLayout)
      return rewriter.notifyMatchFailure(
          stepOp, "the result vector of the step op lacks layout "
                  "attribute");
    // This pattern only handles slice layouts (see pattern documentation).
    auto sliceLayout = dyn_cast<xegpu::SliceAttr>(resultLayout);
    if (!sliceLayout)
      return rewriter.notifyMatchFailure(
          stepOp, "the result layout must be a slice layout");
    if (sliceLayout.getEffectiveLaneLayoutAsInt().size() != 1)
      return rewriter.notifyMatchFailure(
          stepOp, "expecting 1 dim in the effective result layout");

    rewriter.setInsertionPointAfter(warpOp);
    auto loc = stepOp.getLoc();
    auto stepResultVecTy = stepOp.getResult().getType();
    Value distributedVal = warpOp.getResult(operandIdx);
    // Distributed (per-lane) result type, already fixed by the warp op.
    VectorType newVecTy = cast<VectorType>(distributedVal.getType());

    // Starting coordinates of each lane_data block owned by this lane within
    // the SG-level vector.
    auto laneDataBlockCoords = resultLayout.computeDistributedCoords(
        rewriter, loc, warpOp.getLaneid(), stepResultVecTy.getShape());
    if (failed(laneDataBlockCoords))
      return rewriter.notifyMatchFailure(
          stepOp, "failed to compute lane data block coordinates");

    auto laneDataBlockCoordsVec = laneDataBlockCoords.value();
    // Number of consecutive elements per lane_data block.
    auto laneDataBlockLength = resultLayout.getEffectiveLaneDataAsInt()[0];
    assert(static_cast<int64_t>(laneDataBlockCoordsVec.size()) ==
           newVecTy.getNumElements() / laneDataBlockLength);
    SmallVector<Value> stepVals;
    // For each lane_data block, reconstruct its sub-range
    // from the range of SG-level vector.step. Example: vector.step
    // {slice<layout<lane_layout=[2,4,2], lane_data=[1,2,1]>, dims=[0,2]>} :
    // vector<16xindex>
    // Each logical lane holds 4 elements as 2 blocks of 2 elements each.
    // The blocks are round-robin distributed, so logical lane id 0
    // holds values [0,1, 8,9].
    // Note: the loop variable intentionally shadows the FailureOr above.
    for (auto &laneDataBlockCoords : laneDataBlockCoordsVec) {
      // Block start value; subsequent elements are start + 1, start + 2, ...
      auto laneDataBlockStartCoord = laneDataBlockCoords[0];
      stepVals.push_back(laneDataBlockStartCoord);
      for (int i = 1; i < laneDataBlockLength; ++i) {
        auto offset = arith::ConstantIndexOp::create(rewriter, loc, i);
        stepVals.push_back(arith::AddIOp::create(
            rewriter, loc, laneDataBlockStartCoord, offset));
      }
    }
    assert(static_cast<int64_t>(stepVals.size()) == newVecTy.getNumElements() &&
           "Expecting the number of step values to match the number of "
           "elements in the vector");
    // Assemble the per-lane values into the distributed result vector.
    auto stepOpVal =
        vector::FromElementsOp::create(rewriter, loc, newVecTy, stepVals);
    rewriter.replaceAllUsesWith(distributedVal, stepOpVal);
    return success();
  }
};
| |
| struct ConvertLayoutDistribution |
| : public OpRewritePattern<xegpu::ConvertLayoutOp> { |
| using OpRewritePattern::OpRewritePattern; |
| |
| LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op, |
| PatternRewriter &rewriter) const override { |
| auto inputLayout = op.getInputLayoutAttr(); |
| auto targetLayout = op.getTargetLayoutAttr(); |
| |
| if (!inputLayout || !targetLayout) |
| return rewriter.notifyMatchFailure(op, "missing layout attributes"); |
| |
| if (!inputLayout.isCompatibleWith(targetLayout, xegpu::LayoutKind::Lane)) { |
| return rewriter.notifyMatchFailure( |
| op, "lowering incompatible convert_layout not yet supported"); |
| } |
| rewriter.replaceOp(op, op.getSource()); |
| return success(); |
| } |
| }; |
| |
| } // namespace |
| |
| namespace { |
/// Pass that distributes XeGPU subgroup-level operations to individual work
/// items (lanes). The full pipeline (layout recovery, moving function bodies
/// into gpu.warp_execute_on_lane_0, distribution patterns, and cast cleanup)
/// is implemented in runOnOperation below.
struct XeGPUSubgroupDistributePass final
    : public xegpu::impl::XeGPUSubgroupDistributeBase<
          XeGPUSubgroupDistributePass> {
  void runOnOperation() override;
};
| } // namespace |
| |
| void xegpu::populateXeGPUSubgroupDistributePatterns( |
| RewritePatternSet &patterns) { |
| patterns.add<CreateNdDescDistribution, StoreNdDistribution, |
| LoadNdDistribution, DpasDistribution, PrefetchNdDistribution, |
| GpuBarrierDistribution, VectorMultiReductionDistribution, |
| LoadDistribution, StoreDistribution, VectorTransposeDistribution, |
| VectorBitcastDistribution, LoadMatrixDistribution, |
| StoreMatrixDistribution, ConvertLayoutDistribution, |
| MemrefExtractAlignedPointerAsIndexDistribution>( |
| patterns.getContext(), |
| /*pattern benefit=*/PatternHierarchy::Regular); |
| // For following patterns, we need to override the regular vector distribution |
| // patterns. Therefore, assign higher benefit. |
| patterns |
| .add<VectorShapeCastDistribution, VectorExtractStridedSliceDistribution, |
| VectorInsertStridedSliceDistribution, VectorBroadcastDistribution, |
| VectorStepSliceDistribution, SinkUniformOps>( |
| patterns.getContext(), |
| /*pattern benefit=*/PatternHierarchy::AboveRegular); |
| } |
| |
| void xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns( |
| RewritePatternSet &patterns) { |
| patterns.add<MoveFuncBodyToWarpOp>(patterns.getContext()); |
| } |
| |
/// Drives the subgroup-to-workitem distribution in four steps:
///   1. Recover/attach lane layouts to op operands.
///   2. Move each GPU function body into a gpu.warp_execute_on_lane_0 op and
///      hoist scalar uniform code out of it.
///   3. Apply the XeGPU and upstream vector distribution patterns.
///   4. Clean up the UnrealizedConversionCastOps inserted to resolve SIMT
///      tensor-descriptor type mismatches (only if distribution fully
///      succeeded).
void XeGPUSubgroupDistributePass::runOnOperation() {
  // Step 1: Attach layouts to op operands.
  // TODO: Following assumptions are made:
  // 1) It is assumed that there are no layout conflicts.
  // 2) Any existing layout attributes attached to the operands are ignored.
  Operation *op = getOperation();
  if (!xegpu::recoverTemporaryLayouts(op)) {
    signalPassFailure();
    return;
  }

  // Step 2: Move all operations of a GPU function inside
  // gpu.warp_execute_on_lane_0 operation.
  {
    RewritePatternSet patterns(&getContext());
    xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(patterns);

    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
      signalPassFailure();
      return;
    }
    // At this point, we have moved the entire function body inside the
    // warpOp. Now move any scalar uniform code outside of the warpOp (like
    // GPU index ops, scalar constants, etc.). This will simplify the
    // later lowering and avoid custom patterns for these ops.
    getOperation()->walk([&](Operation *op) {
      if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op))
        vector::moveScalarUniformCode(warpOp);
    });
  }
  // Step 3: Apply subgroup to workitem distribution patterns.
  RewritePatternSet patterns(&getContext());
  xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
  // distributionFn is used by vector distribution patterns to determine the
  // distributed vector type for a given vector value. In XeGPU subgroup
  // distribution context, we compute this based on lane layout.
  auto distributionFn = [](Value val) {
    VectorType vecType = dyn_cast<VectorType>(val.getType());
    int64_t vecRank = vecType ? vecType.getRank() : 0;
    // Non-vector (rank-0) values get an empty map, i.e. no distribution.
    if (vecRank == 0)
      return AffineMap::get(val.getContext());
    // Get the layout of the vector type.
    xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(val);
    // If no layout is specified, assume uniform case (no distribution).
    if (!layout)
      return AffineMap::get(val.getContext());
    // Expecting vector and layout rank to match.
    assert(layout.getRank() == vecRank &&
           "Expecting vector and layout rank to match");
    // A dimension is distributed only if layout suggests there are
    // multiple lanes assigned for this dimension and the shape can be evenly
    // distributed to those lanes.
    SmallVector<unsigned int> distributedDims;
    for (auto [i, v] : llvm::enumerate(layout.getEffectiveLaneLayoutAsInt())) {
      if (v > 1 && vecType.getShape()[i] % v == 0)
        distributedDims.push_back(i);
    }
    return AffineMap::getMultiDimMapWithTargets(vecRank, distributedDims,
                                                val.getContext());
  };
  // TODO: shuffleFn is not used. It returns a null Value so that any pattern
  // that would rely on cross-lane shuffles fails to apply instead of
  // generating incorrect code.
  auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
                      int64_t warpSz) { return Value(); };

  vector::populateDistributeReduction(
      patterns, xegpu::subgroupReduction,
      /*pattern benefit=*/PatternHierarchy::Regular);

  vector::populatePropagateWarpVectorDistributionPatterns(
      patterns, distributionFn, shuffleFn,
      /*pattern benefit=*/PatternHierarchy::Regular);
  if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
    signalPassFailure();
    return;
  }

  // Step 4: Finally, clean up UnrealizedConversionCastOps that were inserted
  // due to tensor desc type mismatches created by using upstream distribution
  // patterns (scf.for). This cleanup should only be done if all the ops are
  // distributed successfully; if some ops are still not distributed and remain
  // inside any WarpExecuteOnLane0Op we avoid this simplification step to avoid
  // breaking the IR.
  bool foundWarpOp = false;
  getOperation()->walk([&](gpu::WarpExecuteOnLane0Op warpOp) {
    // Look for WarpOps that are not trivially dead.
    if (isOpTriviallyDead(warpOp))
      return WalkResult::advance();
    foundWarpOp = true;
    return WalkResult::interrupt();
  });
  // A live WarpOp means distribution was incomplete; skip cleanup entirely.
  if (foundWarpOp)
    return;

  getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) {
    // We are only interested in UnrealizedConversionCastOps that were added
    // for resolving SIMT type mismatches.
    if (!op->getAttr(resolveSIMTTypeMismatch))
      return WalkResult::skip();

    Value input = op.getOperand(0);
    Value output = op.getResult(0);

    // Both input and output must have tensor descriptor types.
    xegpu::TensorDescType inputDescType =
        mlir::dyn_cast<xegpu::TensorDescType>(input.getType());
    xegpu::TensorDescType outputDescType =
        mlir::dyn_cast<xegpu::TensorDescType>(output.getType());
    assert(inputDescType && outputDescType &&
           "Unrealized conversion cast must have tensor descriptor types");

    // tensor_desc<shape, layout> -> tensor_desc<shape> Type of conversions.
    // This occurs inside scf.for body to resolve the block argument type to
    // SIMT type. Retype the block argument (and the tied loop result) to the
    // layout-free SIMT type, then route uses of the cast result to it.
    if (inputDescType.getLayout()) {
      auto argument = mlir::dyn_cast<mlir::BlockArgument>(input);
      if (argument) {
        argument.setType(output.getType());
        output.replaceAllUsesWith(argument);
        if (auto loopOp = mlir::dyn_cast<mlir::LoopLikeOpInterface>(
                argument.getOwner()->getParentOp())) {
          auto result = loopOp.getTiedLoopResult(argument);
          result.setType(output.getType());
        }
      }
    }

    // tensor_desc<shape> -> tensor_desc<shape, layout> Type of
    // conversions. This occurs at the yield op of scf.for body to go back
    // from SIMT type to original type.
    if (outputDescType.getLayout())
      output.replaceAllUsesWith(input);

    // Erase the cast only once it is fully disconnected; any cast that still
    // has uses is left in place to keep the IR valid.
    if (op->use_empty())
      op->erase();
    return WalkResult::advance();
  });
}