mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp - llvm-project.git - Git at Google

 //===---- XeGPUBlocking.cpp ---- XeGPU Blocking Pass ----------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 #include "mlir/Dialect/XeGPU/Transforms/Passes.h"

 #include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
 #include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
 #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/Interfaces/LoopLikeInterface.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Support/DebugLog.h"

 namespace mlir {
 namespace xegpu {
 #define GEN_PASS_DEF_XEGPUBLOCKING
 #include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
 } // namespace xegpu
 } // namespace mlir

 #define DEBUG_TYPE "xegpu-blocking"

 using namespace mlir;

 namespace {

 //===------------------------------------------------------------------------===//
 // The XeGPUBlockingPass leverages the unroll patterns for XeGPU and Vector ops
 // to partition operations that process large shapes into multiple operations on
 // smaller shapes, as specified by the inst_data in the layout attribute. This
 // enables each resulting operation to be efficiently mapped to a hardware
 // instruction.
 //===------------------------------------------------------------------------===//

 class XeGPUBlockingPass final
     : public xegpu::impl::XeGPUBlockingBase<XeGPUBlockingPass> {
 public:
   void runOnOperation() override;

 private:
   // Get the tile shape for a given OpOperand or OpResult by examining the
   // corresponding layout attribute. If layout is not present or is not a
   // subgroup level layout, it returns std::nullopt.
   template <typename T,
             typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
                                         std::is_same_v<T, OpResult>>>
   std::optional<SmallVector<int64_t>>
   getTileShape(const T &operandOrResult) const;

   // Get the tile shape for a given operation.
   std::optional<SmallVector<int64_t>> getTileShape(Operation *op) const;

   // Determine if the operation requires unrolling. Return false if all operands
   // and results have tile shapes identical to their original types. Otherwise,
   // return true.
   bool needsUnroll(Operation *op) const;
 };
 } // namespace

 template <typename T, typename>
 std::optional<SmallVector<int64_t>>
 XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
   Value value;
   if constexpr (std::is_same_v<T, OpOperand>) {
     value = operandOrResult.get();
   } else {
     value = (Value)operandOrResult;
   }

   xegpu::DistributeLayoutAttr layout =
       xegpu::getDistributeLayoutAttr(operandOrResult);
   if (layout && layout.isForSubgroup()) {
     if (!layout.getEffectiveInstDataAsInt().empty()) {
       SmallVector<int64_t> instData = layout.getEffectiveInstDataAsInt();
       return instData;
     }
     if (auto type = dyn_cast<ShapedType>(value.getType()))
       return llvm::to_vector(type.getShape());
   }
   LDBG() << "failed to getTileShape for: " << value;
   return std::nullopt;
 }

 std::optional<SmallVector<int64_t>>
 XeGPUBlockingPass::getTileShape(Operation *op) const {
   if (isa<xegpu::CreateNdDescOp, xegpu::LoadMatrixOp>(op))
     return getTileShape(op->getOpResult(0));
   if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp,
           xegpu::StoreMatrixOp>(op))
     return getTileShape(op->getOpOperand(0));
   if (isa<xegpu::StoreNdOp>(op))
     return getTileShape(op->getOpOperand(1));

   if (isa<xegpu::LoadGatherOp>(op))
     return getTileShape(op->getOpResult(0));

   if (auto convertLayoutOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
     auto inputInstData =
         convertLayoutOp.getInputLayout().getEffectiveInstDataAsInt();
     auto targetInstData =
         convertLayoutOp.getTargetLayout().getEffectiveInstDataAsInt();
     // return the one with larger size
     if (computeProduct(inputInstData) >= computeProduct(targetInstData))
       return inputInstData;
     else
       return targetInstData;
   }

   if (isa<xegpu::StoreScatterOp>(op))
     return getTileShape(op->getOpOperand(0));

   // Helper lambda to validate and get A/B tiles
   auto validateABTiles = [&](Operation *op)
       -> std::optional<std::pair<SmallVector<int64_t>, SmallVector<int64_t>>> {
     std::optional<SmallVector<int64_t>> aTile =
         getTileShape(op->getOpOperand(0));
     std::optional<SmallVector<int64_t>> bTile =
         getTileShape(op->getOpOperand(1));

     if (!aTile || aTile->size() < 2 || !bTile || bTile->size() < 2)
       return std::nullopt;

     // Both must have the same number of batch dimensions.
     int64_t aBatchRank = aTile->size() - 2;
     int64_t bBatchRank = bTile->size() - 2;
     if (aBatchRank != bBatchRank)
       return std::nullopt;

     // Batch dimensions must match.
     for (int64_t i = 0; i < aBatchRank; ++i) {
       if ((*aTile)[i] != (*bTile)[i])
         return std::nullopt;
     }

     // Semantic check for A and B: K dimension must match.
     // A[..., M, K] x B[..., K, N]
     if ((*aTile).back() != (*bTile)[bBatchRank])
       return std::nullopt;

     return std::make_pair(*aTile, *bTile);
   };

   // Helper lambda to validate C tile
   auto validateCTile = [&](Operation *op, unsigned cOperandIdx,
                            const SmallVector<int64_t> &aTile,
                            const SmallVector<int64_t> &bTile) -> bool {
     if (op->getNumOperands() <= cOperandIdx)
       return true;

     std::optional<SmallVector<int64_t>> cTile =
         getTileShape(op->getOpOperand(cOperandIdx));
     if (!cTile)
       return false;
     // Expected C tile: batch dims from A + [M, N]
     int64_t aBatchRank = aTile.size() - 2;
     SmallVector<int64_t> expectedCTile(aTile.begin(),
                                        aTile.begin() + aBatchRank);
     expectedCTile.push_back(aTile[aBatchRank]); // M from A
     expectedCTile.push_back(bTile.back());      // N from B
     if (!llvm::equal(*cTile, expectedCTile))
       return false;
     return true;
   };

   // Helper lambda to validate scale A tile for DpasMxOp
   auto validateScaleATile =
       [&](Operation *op, unsigned scaleAOperandIdx,
           const SmallVector<int64_t> &aTile) -> std::optional<int64_t> {
     std::optional<SmallVector<int64_t>> aScaleTile =
         getTileShape(op->getOpOperand(scaleAOperandIdx));

     if (!aScaleTile || aScaleTile->size() < 2)
       return std::nullopt;

     // Validate scale_a tile: [batch..., M_tile, K_scale]
     // M dimension (second-to-last) must match A's M dimension
     int64_t scaleRank = aScaleTile->size();
     int64_t aBatchRank = aTile.size() - 2;
     if ((*aScaleTile)[scaleRank - 2] != aTile[aBatchRank])
       return std::nullopt;

     // Return the K scale factor (last dim)
     return aScaleTile->back();
   };

   // Helper lambda to validate scale B tile for DpasMxOp
   auto validateScaleBTile =
       [&](Operation *op, unsigned scaleBOperandIdx,
           const SmallVector<int64_t> &bTile) -> std::optional<int64_t> {
     std::optional<SmallVector<int64_t>> bScaleTile =
         getTileShape(op->getOpOperand(scaleBOperandIdx));

     if (!bScaleTile || bScaleTile->size() < 2)
       return std::nullopt;

     // Validate scale_b tile: [batch..., K_scale, N_tile]
     // N dimension (last) must match B's N dimension (last)
     if (bScaleTile->back() != bTile.back())
       return std::nullopt;

     // Return the K scale factor (second-to-last dim)
     int64_t scaleRank = bScaleTile->size();
     return (*bScaleTile)[scaleRank - 2];
   };

   if (isa<xegpu::DpasOp>(op)) {
     auto abTiles = validateABTiles(op);
     if (!abTiles)
       return std::nullopt;

     auto [aTile, bTile] = *abTiles;

     // Semantic check for C.
     if (!validateCTile(op, 2, aTile, bTile))
       return std::nullopt;

     // Return [batch..., M, K, N] as the target shape for unrolling.
     int64_t aBatchRank = aTile.size() - 2;
     SmallVector<int64_t> tileShape(aTile.begin(), aTile.begin() + aBatchRank);
     tileShape.push_back(aTile[aBatchRank]);     // M
     tileShape.push_back(aTile[aBatchRank + 1]); // K
     tileShape.push_back(bTile.back());          // N
     return tileShape;
   }

   if (auto dpasMxOp = dyn_cast<xegpu::DpasMxOp>(op)) {
     auto abTiles = validateABTiles(op);
     if (!abTiles)
       return std::nullopt;

     auto [aTile, bTile] = *abTiles;

     // Validate C tile if present using op-specific accessor
     if (dpasMxOp.getAcc()) {
       unsigned accOperandIdx = 2; // acc is the 3rd operand
       if (!validateCTile(op, accOperandIdx, aTile, bTile))
         return std::nullopt;
     }

     // Validate scale tiles if present using op-specific accessors
     int64_t kScaleFactor = 1;
     std::optional<int64_t> scaleAFactor;
     std::optional<int64_t> scaleBFactor;

     if (dpasMxOp.getScaleA()) {
       unsigned scaleAOperandIdx = 2 + (dpasMxOp.getAcc() ? 1 : 0);
       scaleAFactor = validateScaleATile(op, scaleAOperandIdx, aTile);
       if (!scaleAFactor)
         return std::nullopt;
     }

     if (dpasMxOp.getScaleB()) {
       unsigned scaleBOperandIdx =
           2 + (dpasMxOp.getAcc() ? 1 : 0) + (dpasMxOp.getScaleA() ? 1 : 0);
       scaleBFactor = validateScaleBTile(op, scaleBOperandIdx, bTile);
       if (!scaleBFactor)
         return std::nullopt;
     }

     // If both scales are present, their K dimensions must match
     if (scaleAFactor && scaleBFactor) {
       if (*scaleAFactor != *scaleBFactor)
         return std::nullopt;
       kScaleFactor = *scaleAFactor;
     } else if (scaleAFactor) {
       kScaleFactor = *scaleAFactor;
     } else if (scaleBFactor) {
       kScaleFactor = *scaleBFactor;
     }

     // Return [batch..., M, K, N, S] as the target shape for unrolling.
     int64_t aBatchRank = aTile.size() - 2;
     SmallVector<int64_t> tileShape(aTile.begin(), aTile.begin() + aBatchRank);
     tileShape.push_back(aTile[aBatchRank]);     // M
     tileShape.push_back(aTile[aBatchRank + 1]); // K
     tileShape.push_back(bTile.back());          // N
     tileShape.push_back(kScaleFactor);          // S
     return tileShape;
   }

   if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1)
     return getTileShape(op->getOpResult(0));

   if (isa<vector::MultiDimReductionOp>(op))
     return getTileShape(op->getOpOperand(0));

   if (isa<vector::TransposeOp, vector::BroadcastOp, vector::StepOp,
           vector::ShapeCastOp, vector::ConstantMaskOp, vector::CreateMaskOp,
           vector::BitCastOp, vector::InterleaveOp, vector::DeinterleaveOp>(op))
     return getTileShape(op->getOpResult(0));

   return std::nullopt;
 }

 bool XeGPUBlockingPass::needsUnroll(Operation *op) const {
   // skip the op if any of its operands or results has workgroup level layouts
   bool hasWgLayoutOperands =
       llvm::any_of(op->getOpOperands(), [](OpOperand &opr) {
         xegpu::DistributeLayoutAttr layout =
             xegpu::getDistributeLayoutAttr(opr);
         return layout && layout.isForWorkgroup();
       });
   bool hasWgLayoutResults =
       llvm::any_of(op->getOpResults(), [](OpResult result) {
         xegpu::DistributeLayoutAttr layout =
             xegpu::getDistributeLayoutAttr(result);
         return layout && layout.isForWorkgroup();
       });
   if (hasWgLayoutOperands || hasWgLayoutResults) {
     LDBG() << "skip unrolling for op with workgroup level layout: " << *op;
     return false;
   }

   auto isUnrollable = [](Value value, ArrayRef<int64_t> tileShape) {
     Type valTy = value.getType();
     if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(valTy)) {
       xegpu::DistributeLayoutAttr layout = tdescTy.getLayoutAttr();
       return layout && !layout.getEffectiveInstDataAsInt().empty();
     }
     auto shapedType = dyn_cast<ShapedType>(valTy);
     return shapedType && !llvm::equal(tileShape, shapedType.getShape());
   };

   bool hasUnrollableOperands =
       llvm::any_of(op->getOpOperands(), [&](OpOperand &opr) {
         std::optional<SmallVector<int64_t>> tileShape = getTileShape(opr);
         return tileShape.has_value() && isUnrollable(opr.get(), *tileShape);
       });
   bool hasUnrollableResults =
       llvm::any_of(op->getOpResults(), [&](OpResult result) {
         std::optional<SmallVector<int64_t>> tileShape = getTileShape(result);
         return tileShape.has_value() && isUnrollable(result, *tileShape);
       });
   // ConvertLayoutOp must be processed to drop the inst_data in the layout
   bool isConvertLayoutWithInstData = false;
   if (auto convertLayoutOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
     auto targettLayout = convertLayoutOp.getTargetLayout();
     if (targettLayout && !targettLayout.getEffectiveInstDataAsInt().empty()) {
       isConvertLayoutWithInstData = true;
     }
   }
   return hasUnrollableOperands || hasUnrollableResults ||
          isConvertLayoutWithInstData;
 }

 void XeGPUBlockingPass::runOnOperation() {
   MLIRContext *ctx = &getContext();
   Operation *op = getOperation();

   if (!xegpu::recoverTemporaryLayouts(op)) {
     signalPassFailure();
     return;
   }

   auto getTileShapeAndCount = [](llvm::ArrayRef<int64_t> shape,
                                  xegpu::DistributeLayoutAttr layout) {
     int count = 1;
     SmallVector<int64_t> tileShape(shape);
     if (layout && !layout.getEffectiveInstDataAsInt().empty()) {
       tileShape = layout.getEffectiveInstDataAsInt();
       count = computeProduct(shape) / computeProduct(tileShape);
     }
     assert(count >= 1 && "count must be at least 1");
     return std::make_pair(tileShape, count);
   };

   // Perform context-aware type conversion for SCF structural ops.
   // Inspects Values to find inst_data layout information for 1:N conversion.
   llvm::SmallSetVector<UnrealizedConversionCastOp, 8> existingCasts;
   op->walk(
       [&](UnrealizedConversionCastOp castOp) { existingCasts.insert(castOp); });

   {
     TypeConverter converter;
     converter.addConversion([](Type type) -> Type { return type; });

     // TensorDescType 1:N converter (type-based, layout is in the type).
     converter.addConversion(
         [&](xegpu::TensorDescType type,
             SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
           Type elemTy = type.getElementType();
           ArrayRef<int64_t> shape = type.getShape();

           xegpu::DistributeLayoutAttr layout = type.getLayoutAttr();
           if (layout && layout.isForWorkgroup())
             return failure();

           int count;
           SmallVector<int64_t> subShape;
           std::tie(subShape, count) = getTileShapeAndCount(shape, layout);

           if (layout)
             layout = layout.dropInstData();

           auto newTy = xegpu::TensorDescType::get(
               type.getContext(), subShape, elemTy, type.getEncoding(), layout);
           result.append(count, newTy);
           return success();
         });

     // Context-aware VectorType conversion based on inst_data (1:1
     // shape-changing or 1:N).
     auto getSubShapeAndCount = [&](VectorType vecTy,
                                    xegpu::DistributeLayoutAttr layout)
         -> std::pair<SmallVector<int64_t>, int> {
       return getTileShapeAndCount(vecTy.getShape(), layout);
     };
     auto loopArgTypes =
         xegpu::precomputeLoopBlockArgTypes(op, getSubShapeAndCount);
     xegpu::addVectorTypeConversion(converter, getSubShapeAndCount,
                                    std::move(loopArgTypes));

     // Loop-carried types are now in the converter's map, so the transient
     // per-position layout attrs on SCF loop ops are no longer needed. Strip
     // them before converting: the SCF converters copy old attrs onto the new
     // op (ConvertForOpTypes::setAttrs), and after 1:N result expansion a stale
     // `layout_result_N` lands on the wrong (renumbered) result, corrupting the
     // count invariant and leaving the loop illegal.
     op->walk([](Operation *loopOp) {
       if (!isa<scf::ForOp, scf::WhileOp, scf::ConditionOp>(loopOp))
         return;
       SmallVector<StringRef> toRemove;
       for (const NamedAttribute &attr : loopOp->getAttrs()) {
         StringRef name = attr.getName().strref();
         if (name.starts_with("layout_operand_") ||
             name.starts_with("layout_result_"))
           toRemove.push_back(name);
       }
       for (StringRef name : toRemove)
         loopOp->removeAttr(name);
     });

     // Source (N:1) and target (1:1) materializations using
     // UnrealizedConversionCastOp.
     auto materializeCast = [](OpBuilder &builder, Type type, ValueRange inputs,
                               Location loc) -> Value {
       return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
           .getResult(0);
     };
     converter.addSourceMaterialization(materializeCast);
     converter.addTargetMaterialization(materializeCast);
     // Blocking runs SCF conversion separately (not combined with XeGPU
     // patterns), so it also needs a 1:N target materialization.
     converter.addTargetMaterialization(
         [](mlir::OpBuilder &builder, mlir::TypeRange types,
            mlir::ValueRange inputs, mlir::Location loc) -> SmallVector<Value> {
           auto castOp =
               UnrealizedConversionCastOp::create(builder, loc, types, inputs);
           return SmallVector<Value>(castOp.getResults());
         });

     ConversionTarget target(*ctx);
     target.addLegalOp<UnrealizedConversionCastOp>();
     target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });

     RewritePatternSet scfPatterns(ctx);
     scf::populateSCFStructuralTypeConversionsAndLegality(converter, scfPatterns,
                                                          target);
     if (failed(applyPartialConversion(op, target, std::move(scfPatterns))))
       return signalPassFailure();

     // Fold cancelling cast chains and erase dead casts.
     xegpu::cleanupUnrealizedConversionCasts(op, existingCasts);
   }

   xegpu::UnrollOptions options;
   options.setFilterConstraint(
       [&](Operation *op) -> LogicalResult { return success(needsUnroll(op)); });

   options.setNativeShapeFn([&](Operation *op) { return getTileShape(op); });

   options.setUnrolledTypesFn([&](ShapedType type, ArrayRef<int64_t> tileShape) {
     Type elemTy = type.getElementType();

     if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(type)) {

       Attribute encoding = tdescTy.getEncoding();

       xegpu::TensorDescType newTy =
           xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding,
                                      tdescTy.getLayoutAttr().dropInstData());
       // Compute the product of batch (higher) dimensions.
       ArrayRef<int64_t> shape = type.getShape();
       int64_t batchCount =
           shape.size() > 2 ? computeProduct(shape.drop_back(2)) : 1;
       return SmallVector<Type>(batchCount, newTy);
     }
     Type newTy = VectorType::get(tileShape, elemTy);

     std::optional<SmallVector<int64_t>> ratio =
         computeShapeRatio(type.getShape(), tileShape);
     assert(ratio && "The shape of the type must be a multiple of tileShape.");
     return SmallVector<Type>(computeProduct(*ratio), newTy);
   });

   RewritePatternSet patterns(ctx);
   vector::UnrollVectorOptions vectorOptions;
   vectorOptions.setNativeShapeFn(options.nativeShape);

   populateXeGPUUnrollPatterns(patterns, options);
   vector::populateVectorUnrollPatterns(patterns, vectorOptions);

   // Note: The pattern driver does op folding as well and clean up.
   // But intermediate insert/extract strided slice ops with
   // unrealized conversion cast ops in the middle does not get
   // cleaned up in this step. One more round of folding is needed
   // after the walk to resolve those unrealized conversion cast ops.
   (void)applyPatternsGreedily(op, std::move(patterns));

   op->walk([](Operation *op) {
     // Remove the layout attributes cached per operands.
     for (OpOperand &opr : op->getOpOperands()) {
       std::string name = xegpu::getTemporaryLayoutName(opr);
       if (op->hasAttrOfType<xegpu::DistributeLayoutAttr>(name))
         op->removeAttr(name);
     }

     // Update the layout attributes per result.
     for (OpResult result : op->getOpResults()) {
       std::string name = xegpu::getTemporaryLayoutName(result);
       if (auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(name)) {
         op->removeAttr(name);
         if (!isa<LoopLikeOpInterface>(op))
           xegpu::setDistributeLayoutAttr(result, layout.dropInstData());
       }
     }

     // Drop left-over inst_data if the unroll pattern does not being applied,
     // say, inst_data just matches their shape.
     SmallVector<NamedAttribute> newAttrs =
         xegpu::dropInstDataOnAttrs(op->getAttrs());
     op->setAttrs(newAttrs);
   });

   // Resolve UnrealizedConversionCastOps generated by SCF structural type
   // conversion and by XeGPU/Vector unrolling (cancelling cast chains and
   // unpaired pack/unpack casts).
   xegpu::cleanupUnrealizedConversionCasts(op, existingCasts);

   // One more round of folding to clean up the intermediate
   // insert/extract strided slice ops.
   RewritePatternSet emptyPatterns(ctx);
   (void)applyPatternsGreedily(op, std::move(emptyPatterns));
 }
	//===---- XeGPUBlocking.cpp ---- XeGPU Blocking Pass ----------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "mlir/Dialect/XeGPU/Transforms/Passes.h"

	#include "mlir/Dialect/Index/IR/IndexDialect.h"
	#include "mlir/Dialect/SCF/IR/SCF.h"
	#include "mlir/Dialect/SCF/Transforms/Patterns.h"
	#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
	#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
	#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
	#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
	#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
	#include "mlir/Interfaces/LoopLikeInterface.h"
	#include "mlir/Pass/PassManager.h"
	#include "mlir/Transforms/DialectConversion.h"
	#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/Support/DebugLog.h"

	namespace mlir {
	namespace xegpu {
	#define GEN_PASS_DEF_XEGPUBLOCKING
	#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
	} // namespace xegpu
	} // namespace mlir

	#define DEBUG_TYPE "xegpu-blocking"

	using namespace mlir;

	namespace {

	//===------------------------------------------------------------------------===//
	// The XeGPUBlockingPass leverages the unroll patterns for XeGPU and Vector ops
	// to partition operations that process large shapes into multiple operations on
	// smaller shapes, as specified by the inst_data in the layout attribute. This
	// enables each resulting operation to be efficiently mapped to a hardware
	// instruction.
	//===------------------------------------------------------------------------===//

	class XeGPUBlockingPass final
	: public xegpu::impl::XeGPUBlockingBase<XeGPUBlockingPass> {
	public:
	void runOnOperation() override;

	private:
	// Get the tile shape for a given OpOperand or OpResult by examining the
	// corresponding layout attribute. If layout is not present or is not a
	// subgroup level layout, it returns std::nullopt.
	template <typename T,
	typename = std::enable_if_t<std::is_same_v<T, OpOperand> \|\|
	std::is_same_v<T, OpResult>>>
	std::optional<SmallVector<int64_t>>
	getTileShape(const T &operandOrResult) const;

	// Get the tile shape for a given operation.
	std::optional<SmallVector<int64_t>> getTileShape(Operation *op) const;

	// Determine if the operation requires unrolling. Return false if all operands
	// and results have tile shapes identical to their original types. Otherwise,
	// return true.
	bool needsUnroll(Operation *op) const;
	};
	} // namespace

	template <typename T, typename>
	std::optional<SmallVector<int64_t>>
	XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
	Value value;
	if constexpr (std::is_same_v<T, OpOperand>) {
	value = operandOrResult.get();
	} else {
	value = (Value)operandOrResult;
	}

	xegpu::DistributeLayoutAttr layout =
	xegpu::getDistributeLayoutAttr(operandOrResult);
	if (layout && layout.isForSubgroup()) {
	if (!layout.getEffectiveInstDataAsInt().empty()) {
	SmallVector<int64_t> instData = layout.getEffectiveInstDataAsInt();
	return instData;
	}
	if (auto type = dyn_cast<ShapedType>(value.getType()))
	return llvm::to_vector(type.getShape());
	}
	LDBG() << "failed to getTileShape for: " << value;
	return std::nullopt;
	}

	std::optional<SmallVector<int64_t>>
	XeGPUBlockingPass::getTileShape(Operation *op) const {
	if (isa<xegpu::CreateNdDescOp, xegpu::LoadMatrixOp>(op))
	return getTileShape(op->getOpResult(0));
	if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp,
	xegpu::StoreMatrixOp>(op))
	return getTileShape(op->getOpOperand(0));
	if (isa<xegpu::StoreNdOp>(op))
	return getTileShape(op->getOpOperand(1));

	if (isa<xegpu::LoadGatherOp>(op))
	return getTileShape(op->getOpResult(0));

	if (auto convertLayoutOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
	auto inputInstData =
	convertLayoutOp.getInputLayout().getEffectiveInstDataAsInt();
	auto targetInstData =
	convertLayoutOp.getTargetLayout().getEffectiveInstDataAsInt();
	// return the one with larger size
	if (computeProduct(inputInstData) >= computeProduct(targetInstData))
	return inputInstData;
	else
	return targetInstData;
	}

	if (isa<xegpu::StoreScatterOp>(op))
	return getTileShape(op->getOpOperand(0));

	// Helper lambda to validate and get A/B tiles
	auto validateABTiles = [&](Operation *op)
	-> std::optional<std::pair<SmallVector<int64_t>, SmallVector<int64_t>>> {
	std::optional<SmallVector<int64_t>> aTile =
	getTileShape(op->getOpOperand(0));
	std::optional<SmallVector<int64_t>> bTile =
	getTileShape(op->getOpOperand(1));

	if (!aTile \|\| aTile->size() < 2 \|\| !bTile \|\| bTile->size() < 2)
	return std::nullopt;

	// Both must have the same number of batch dimensions.
	int64_t aBatchRank = aTile->size() - 2;
	int64_t bBatchRank = bTile->size() - 2;
	if (aBatchRank != bBatchRank)
	return std::nullopt;

	// Batch dimensions must match.
	for (int64_t i = 0; i < aBatchRank; ++i) {
	if ((aTile)[i] != (bTile)[i])
	return std::nullopt;
	}

	// Semantic check for A and B: K dimension must match.
	// A[..., M, K] x B[..., K, N]
	if ((aTile).back() != (bTile)[bBatchRank])
	return std::nullopt;

	return std::make_pair(aTile, bTile);
	};

	// Helper lambda to validate C tile
	auto validateCTile = [&](Operation *op, unsigned cOperandIdx,
	const SmallVector<int64_t> &aTile,
	const SmallVector<int64_t> &bTile) -> bool {
	if (op->getNumOperands() <= cOperandIdx)
	return true;

	std::optional<SmallVector<int64_t>> cTile =
	getTileShape(op->getOpOperand(cOperandIdx));
	if (!cTile)
	return false;
	// Expected C tile: batch dims from A + [M, N]
	int64_t aBatchRank = aTile.size() - 2;
	SmallVector<int64_t> expectedCTile(aTile.begin(),
	aTile.begin() + aBatchRank);
	expectedCTile.push_back(aTile[aBatchRank]); // M from A
	expectedCTile.push_back(bTile.back()); // N from B
	if (!llvm::equal(*cTile, expectedCTile))
	return false;
	return true;
	};

	// Helper lambda to validate scale A tile for DpasMxOp
	auto validateScaleATile =
	[&](Operation *op, unsigned scaleAOperandIdx,
	const SmallVector<int64_t> &aTile) -> std::optional<int64_t> {
	std::optional<SmallVector<int64_t>> aScaleTile =
	getTileShape(op->getOpOperand(scaleAOperandIdx));

	if (!aScaleTile \|\| aScaleTile->size() < 2)
	return std::nullopt;

	// Validate scale_a tile: [batch..., M_tile, K_scale]
	// M dimension (second-to-last) must match A's M dimension
	int64_t scaleRank = aScaleTile->size();
	int64_t aBatchRank = aTile.size() - 2;
	if ((*aScaleTile)[scaleRank - 2] != aTile[aBatchRank])
	return std::nullopt;

	// Return the K scale factor (last dim)
	return aScaleTile->back();
	};

	// Helper lambda to validate scale B tile for DpasMxOp
	auto validateScaleBTile =
	[&](Operation *op, unsigned scaleBOperandIdx,
	const SmallVector<int64_t> &bTile) -> std::optional<int64_t> {
	std::optional<SmallVector<int64_t>> bScaleTile =
	getTileShape(op->getOpOperand(scaleBOperandIdx));

	if (!bScaleTile \|\| bScaleTile->size() < 2)
	return std::nullopt;

	// Validate scale_b tile: [batch..., K_scale, N_tile]
	// N dimension (last) must match B's N dimension (last)
	if (bScaleTile->back() != bTile.back())
	return std::nullopt;

	// Return the K scale factor (second-to-last dim)
	int64_t scaleRank = bScaleTile->size();
	return (*bScaleTile)[scaleRank - 2];
	};

	if (isa<xegpu::DpasOp>(op)) {
	auto abTiles = validateABTiles(op);
	if (!abTiles)
	return std::nullopt;

	auto [aTile, bTile] = *abTiles;

	// Semantic check for C.
	if (!validateCTile(op, 2, aTile, bTile))
	return std::nullopt;

	// Return [batch..., M, K, N] as the target shape for unrolling.
	int64_t aBatchRank = aTile.size() - 2;
	SmallVector<int64_t> tileShape(aTile.begin(), aTile.begin() + aBatchRank);
	tileShape.push_back(aTile[aBatchRank]); // M
	tileShape.push_back(aTile[aBatchRank + 1]); // K
	tileShape.push_back(bTile.back()); // N
	return tileShape;
	}

	if (auto dpasMxOp = dyn_cast<xegpu::DpasMxOp>(op)) {
	auto abTiles = validateABTiles(op);
	if (!abTiles)
	return std::nullopt;

	auto [aTile, bTile] = *abTiles;

	// Validate C tile if present using op-specific accessor
	if (dpasMxOp.getAcc()) {
	unsigned accOperandIdx = 2; // acc is the 3rd operand
	if (!validateCTile(op, accOperandIdx, aTile, bTile))
	return std::nullopt;
	}

	// Validate scale tiles if present using op-specific accessors
	int64_t kScaleFactor = 1;
	std::optional<int64_t> scaleAFactor;
	std::optional<int64_t> scaleBFactor;

	if (dpasMxOp.getScaleA()) {
	unsigned scaleAOperandIdx = 2 + (dpasMxOp.getAcc() ? 1 : 0);
	scaleAFactor = validateScaleATile(op, scaleAOperandIdx, aTile);
	if (!scaleAFactor)
	return std::nullopt;
	}

	if (dpasMxOp.getScaleB()) {
	unsigned scaleBOperandIdx =
	2 + (dpasMxOp.getAcc() ? 1 : 0) + (dpasMxOp.getScaleA() ? 1 : 0);
	scaleBFactor = validateScaleBTile(op, scaleBOperandIdx, bTile);
	if (!scaleBFactor)
	return std::nullopt;
	}

	// If both scales are present, their K dimensions must match
	if (scaleAFactor && scaleBFactor) {
	if (scaleAFactor != scaleBFactor)
	return std::nullopt;
	kScaleFactor = *scaleAFactor;
	} else if (scaleAFactor) {
	kScaleFactor = *scaleAFactor;
	} else if (scaleBFactor) {
	kScaleFactor = *scaleBFactor;
	}

	// Return [batch..., M, K, N, S] as the target shape for unrolling.
	int64_t aBatchRank = aTile.size() - 2;
	SmallVector<int64_t> tileShape(aTile.begin(), aTile.begin() + aBatchRank);
	tileShape.push_back(aTile[aBatchRank]); // M
	tileShape.push_back(aTile[aBatchRank + 1]); // K
	tileShape.push_back(bTile.back()); // N
	tileShape.push_back(kScaleFactor); // S
	return tileShape;
	}

	if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1)
	return getTileShape(op->getOpResult(0));

	if (isa<vector::MultiDimReductionOp>(op))
	return getTileShape(op->getOpOperand(0));

	if (isa<vector::TransposeOp, vector::BroadcastOp, vector::StepOp,
	vector::ShapeCastOp, vector::ConstantMaskOp, vector::CreateMaskOp,
	vector::BitCastOp, vector::InterleaveOp, vector::DeinterleaveOp>(op))
	return getTileShape(op->getOpResult(0));

	return std::nullopt;
	}

	bool XeGPUBlockingPass::needsUnroll(Operation *op) const {
	// skip the op if any of its operands or results has workgroup level layouts
	bool hasWgLayoutOperands =
	llvm::any_of(op->getOpOperands(), [](OpOperand &opr) {
	xegpu::DistributeLayoutAttr layout =
	xegpu::getDistributeLayoutAttr(opr);
	return layout && layout.isForWorkgroup();
	});
	bool hasWgLayoutResults =
	llvm::any_of(op->getOpResults(), [](OpResult result) {
	xegpu::DistributeLayoutAttr layout =
	xegpu::getDistributeLayoutAttr(result);
	return layout && layout.isForWorkgroup();
	});
	if (hasWgLayoutOperands \|\| hasWgLayoutResults) {
	LDBG() << "skip unrolling for op with workgroup level layout: " << *op;
	return false;
	}

	auto isUnrollable = [](Value value, ArrayRef<int64_t> tileShape) {
	Type valTy = value.getType();
	if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(valTy)) {
	xegpu::DistributeLayoutAttr layout = tdescTy.getLayoutAttr();
	return layout && !layout.getEffectiveInstDataAsInt().empty();
	}
	auto shapedType = dyn_cast<ShapedType>(valTy);
	return shapedType && !llvm::equal(tileShape, shapedType.getShape());
	};

	bool hasUnrollableOperands =
	llvm::any_of(op->getOpOperands(), [&](OpOperand &opr) {
	std::optional<SmallVector<int64_t>> tileShape = getTileShape(opr);
	return tileShape.has_value() && isUnrollable(opr.get(), *tileShape);
	});
	bool hasUnrollableResults =
	llvm::any_of(op->getOpResults(), [&](OpResult result) {
	std::optional<SmallVector<int64_t>> tileShape = getTileShape(result);
	return tileShape.has_value() && isUnrollable(result, *tileShape);
	});
	// ConvertLayoutOp must be processed to drop the inst_data in the layout
	bool isConvertLayoutWithInstData = false;
	if (auto convertLayoutOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
	auto targettLayout = convertLayoutOp.getTargetLayout();
	if (targettLayout && !targettLayout.getEffectiveInstDataAsInt().empty()) {
	isConvertLayoutWithInstData = true;
	}
	}
	return hasUnrollableOperands \|\| hasUnrollableResults \|\|
	isConvertLayoutWithInstData;
	}

	void XeGPUBlockingPass::runOnOperation() {
	MLIRContext *ctx = &getContext();
	Operation *op = getOperation();

	if (!xegpu::recoverTemporaryLayouts(op)) {
	signalPassFailure();
	return;
	}

	auto getTileShapeAndCount = [](llvm::ArrayRef<int64_t> shape,
	xegpu::DistributeLayoutAttr layout) {
	int count = 1;
	SmallVector<int64_t> tileShape(shape);
	if (layout && !layout.getEffectiveInstDataAsInt().empty()) {
	tileShape = layout.getEffectiveInstDataAsInt();
	count = computeProduct(shape) / computeProduct(tileShape);
	}
	assert(count >= 1 && "count must be at least 1");
	return std::make_pair(tileShape, count);
	};

	// Perform context-aware type conversion for SCF structural ops.
	// Inspects Values to find inst_data layout information for 1:N conversion.
	llvm::SmallSetVector<UnrealizedConversionCastOp, 8> existingCasts;
	op->walk(
	[&](UnrealizedConversionCastOp castOp) { existingCasts.insert(castOp); });

	{
	TypeConverter converter;
	converter.addConversion([](Type type) -> Type { return type; });

	// TensorDescType 1:N converter (type-based, layout is in the type).
	converter.addConversion(
	[&](xegpu::TensorDescType type,
	SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
	Type elemTy = type.getElementType();
	ArrayRef<int64_t> shape = type.getShape();

	xegpu::DistributeLayoutAttr layout = type.getLayoutAttr();
	if (layout && layout.isForWorkgroup())
	return failure();

	int count;
	SmallVector<int64_t> subShape;
	std::tie(subShape, count) = getTileShapeAndCount(shape, layout);

	if (layout)
	layout = layout.dropInstData();

	auto newTy = xegpu::TensorDescType::get(
	type.getContext(), subShape, elemTy, type.getEncoding(), layout);
	result.append(count, newTy);
	return success();
	});

	// Context-aware VectorType conversion based on inst_data (1:1
	// shape-changing or 1:N).
	auto getSubShapeAndCount = [&](VectorType vecTy,
	xegpu::DistributeLayoutAttr layout)
	-> std::pair<SmallVector<int64_t>, int> {
	return getTileShapeAndCount(vecTy.getShape(), layout);
	};
	auto loopArgTypes =
	xegpu::precomputeLoopBlockArgTypes(op, getSubShapeAndCount);
	xegpu::addVectorTypeConversion(converter, getSubShapeAndCount,
	std::move(loopArgTypes));

	// Loop-carried types are now in the converter's map, so the transient
	// per-position layout attrs on SCF loop ops are no longer needed. Strip
	// them before converting: the SCF converters copy old attrs onto the new
	// op (ConvertForOpTypes::setAttrs), and after 1:N result expansion a stale
	// `layout_result_N` lands on the wrong (renumbered) result, corrupting the
	// count invariant and leaving the loop illegal.
	op->walk([](Operation *loopOp) {
	if (!isa<scf::ForOp, scf::WhileOp, scf::ConditionOp>(loopOp))
	return;
	SmallVector<StringRef> toRemove;
	for (const NamedAttribute &attr : loopOp->getAttrs()) {
	StringRef name = attr.getName().strref();
	if (name.starts_with("layout_operand_") \|\|
	name.starts_with("layout_result_"))
	toRemove.push_back(name);
	}
	for (StringRef name : toRemove)
	loopOp->removeAttr(name);
	});

	// Source (N:1) and target (1:1) materializations using
	// UnrealizedConversionCastOp.
	auto materializeCast = [](OpBuilder &builder, Type type, ValueRange inputs,
	Location loc) -> Value {
	return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
	.getResult(0);
	};
	converter.addSourceMaterialization(materializeCast);
	converter.addTargetMaterialization(materializeCast);
	// Blocking runs SCF conversion separately (not combined with XeGPU
	// patterns), so it also needs a 1:N target materialization.
	converter.addTargetMaterialization(
	[](mlir::OpBuilder &builder, mlir::TypeRange types,
	mlir::ValueRange inputs, mlir::Location loc) -> SmallVector<Value> {
	auto castOp =
	UnrealizedConversionCastOp::create(builder, loc, types, inputs);
	return SmallVector<Value>(castOp.getResults());
	});

	ConversionTarget target(*ctx);
	target.addLegalOp<UnrealizedConversionCastOp>();
	target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });

	RewritePatternSet scfPatterns(ctx);
	scf::populateSCFStructuralTypeConversionsAndLegality(converter, scfPatterns,
	target);
	if (failed(applyPartialConversion(op, target, std::move(scfPatterns))))
	return signalPassFailure();

	// Fold cancelling cast chains and erase dead casts.
	xegpu::cleanupUnrealizedConversionCasts(op, existingCasts);
	}

	xegpu::UnrollOptions options;
	options.setFilterConstraint(
	[&](Operation *op) -> LogicalResult { return success(needsUnroll(op)); });

	options.setNativeShapeFn([&](Operation *op) { return getTileShape(op); });

	options.setUnrolledTypesFn([&](ShapedType type, ArrayRef<int64_t> tileShape) {
	Type elemTy = type.getElementType();

	if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(type)) {

	Attribute encoding = tdescTy.getEncoding();

	xegpu::TensorDescType newTy =
	xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding,
	tdescTy.getLayoutAttr().dropInstData());
	// Compute the product of batch (higher) dimensions.
	ArrayRef<int64_t> shape = type.getShape();
	int64_t batchCount =
	shape.size() > 2 ? computeProduct(shape.drop_back(2)) : 1;
	return SmallVector<Type>(batchCount, newTy);
	}
	Type newTy = VectorType::get(tileShape, elemTy);

	std::optional<SmallVector<int64_t>> ratio =
	computeShapeRatio(type.getShape(), tileShape);
	assert(ratio && "The shape of the type must be a multiple of tileShape.");
	return SmallVector<Type>(computeProduct(*ratio), newTy);
	});

	RewritePatternSet patterns(ctx);
	vector::UnrollVectorOptions vectorOptions;
	vectorOptions.setNativeShapeFn(options.nativeShape);

	populateXeGPUUnrollPatterns(patterns, options);
	vector::populateVectorUnrollPatterns(patterns, vectorOptions);

	// Note: The pattern driver does op folding as well and clean up.
	// But intermediate insert/extract strided slice ops with
	// unrealized conversion cast ops in the middle does not get
	// cleaned up in this step. One more round of folding is needed
	// after the walk to resolve those unrealized conversion cast ops.
	(void)applyPatternsGreedily(op, std::move(patterns));

	op->walk([](Operation *op) {
	// Remove the layout attributes cached per operands.
	for (OpOperand &opr : op->getOpOperands()) {
	std::string name = xegpu::getTemporaryLayoutName(opr);
	if (op->hasAttrOfType<xegpu::DistributeLayoutAttr>(name))
	op->removeAttr(name);
	}

	// Update the layout attributes per result.
	for (OpResult result : op->getOpResults()) {
	std::string name = xegpu::getTemporaryLayoutName(result);
	if (auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(name)) {
	op->removeAttr(name);
	if (!isa<LoopLikeOpInterface>(op))
	xegpu::setDistributeLayoutAttr(result, layout.dropInstData());
	}
	}

	// Drop left-over inst_data if the unroll pattern does not being applied,
	// say, inst_data just matches their shape.
	SmallVector<NamedAttribute> newAttrs =
	xegpu::dropInstDataOnAttrs(op->getAttrs());
	op->setAttrs(newAttrs);
	});

	// Resolve UnrealizedConversionCastOps generated by SCF structural type
	// conversion and by XeGPU/Vector unrolling (cancelling cast chains and
	// unpaired pack/unpack casts).
	xegpu::cleanupUnrealizedConversionCasts(op, existingCasts);

	// One more round of folding to clean up the intermediate
	// insert/extract strided slice ops.
	RewritePatternSet emptyPatterns(ctx);
	(void)applyPatternsGreedily(op, std::move(emptyPatterns));
	}