//===- XeGPUOptimizeBlockLoads.cpp - XeGPU optimize block loads -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
#include "mlir/Dialect/XeGPU/uArch/uArchBase.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/Types.h"
#include "mlir/IR/Value.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <optional>
namespace mlir {
namespace xegpu {
#define GEN_PASS_DEF_XEGPUOPTIMIZEBLOCKLOADS
#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
} // namespace xegpu
} // namespace mlir
#define DEBUG_TYPE "xegpu-optimize-block-loads"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
using namespace mlir;
using namespace mlir::xegpu::uArch;
namespace {
/// Get the 2D lane data from a tensor desc type if it exists.
static std::optional<SmallVector<int64_t>>
getMaybeLaneData(xegpu::TensorDescType tdescType) {
auto layout = tdescType.getLayoutAttr();
if (!layout)
return std::nullopt;
auto laneData = layout.getEffectiveLaneDataAsInt();
if (laneData.size() != 2)
return std::nullopt;
return laneData;
}
/// Get the 2D lane layout from a tensor desc type if it exists.
static std::optional<SmallVector<int64_t>>
getMaybeLaneLayout(xegpu::TensorDescType tdescType) {
auto layout = tdescType.getLayoutAttr();
if (!layout)
return std::nullopt;
auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
if (laneLayout.size() != 2)
return std::nullopt;
return laneLayout;
}
/// A layout can be optimized if its lane layout is transposed (lane_layout[0]
/// != 1 && lane_layout[1] == 1) and its lane data has the form [1, n] with
/// n != 1.
/// Example:
///   !xegpu.tensor_desc<16x16xf16,
///     #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
/// In this case, the lane layout is transposed (from the usual [1, SG_SIZE]
/// form), indicating a load that requires a transpose effect. However, the
/// lane data is [1, 2], meaning that each lane must grab 2 f16 elements from
/// the inner dimension. We convert this to an optimized form by converting
/// the tensor_desc to i32 type so that the lane data becomes [1, 1]. This
/// lets the later lowering directly use the load-with-transpose instruction.
static bool canBeOptimizedForTranspose(ArrayRef<int64_t> laneLayout,
ArrayRef<int64_t> laneData) {
if (laneLayout.size() != 2 || laneData.size() != 2)
return false;
if (laneLayout[0] == 1 || laneLayout[1] != 1)
return false;
if (laneData[0] != 1 || laneData[1] == 1)
return false;
return true;
}
/// A tensor desc type can be optimized if its element type is less than 32 bits
/// and its layout can be optimized.
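/// For example, a hypothetical !xegpu.tensor_desc<16x16xf32, ...> with the
/// same transposed layout is not optimized because its element type is
/// already 32 bits wide, whereas the 16x16xf16 descriptor shown above is.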
static bool canBeOptimizedForTranspose(xegpu::TensorDescType tdescType) {
// If the dtype is greater or equal to 32 bits, layout must be valid.
int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth();
if (elementTyBitwidth >= 32)
return false;
auto maybeLaneLayout = getMaybeLaneLayout(tdescType);
auto maybeLaneData = getMaybeLaneData(tdescType);
if (!maybeLaneData || !maybeLaneLayout)
return false;
return canBeOptimizedForTranspose(*maybeLaneLayout, *maybeLaneData);
}
/// Check if a tensor desc type can be optimized for transpose. If so, return
/// a new optimized tensor desc type with a valid transpose layout; otherwise,
/// return the original type unchanged.
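/// As a rough illustration (the exact block sizes come from the uArch query
/// below and are an assumption here), a descriptor such as
///   !xegpu.tensor_desc<16x16xf16,
///     #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
/// would become
///   !xegpu.tensor_desc<16x8xi32,
///     #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
/// assuming the hardware supports a 16x8 d32 transposed block load.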
static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType,
const uArch *targetuArch) {
if (!canBeOptimizedForTranspose(tdescType))
return tdescType;
auto laneData = getMaybeLaneData(tdescType)
.value(); // Lane data must exist if we reach here.
int64_t innerLaneData = laneData[1];
int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth();
// The required shape is the total shape of the vector result that this
// tensor desc must eventually load, after adjusting for the new bitwidth and
// array length.
SmallVector<int64_t> requiredShape(tdescType.getShape());
requiredShape.back() =
requiredShape.back() * tdescType.getArrayLength() / innerLaneData;
int newBitWidth = elementTyBitwidth * innerLaneData;
Type newElemTy = IntegerType::get(tdescType.getContext(), newBitWidth);
// The supported shape is the largest transpose block shape supported by the
// hardware that evenly divides the required shape.
auto *blockLoadTarget = dyn_cast<Subgroup2DBlockLoadInstruction>(
targetuArch->getInstruction(InstructionKind::Subgroup2DBlockLoad));
auto maybeHWParams = blockLoadTarget->getBlockWidthHeightCount(
newElemTy, /** has transform */ false, /** has transpose */ true);
// If no HW params found, return the original type.
if (!maybeHWParams)
return tdescType;
auto [widths, heights, counts] = maybeHWParams.value();
// TODO: Currently we expect the array length to be 1 for the transpose case.
if (counts.size() != 1 || counts[0] != 1)
return tdescType;
int arrayLen = counts[0];
int supportedHeight =
xegpu::getLargestDivisor(static_cast<int>(requiredShape[0]), heights);
int supportedWidth =
xegpu::getLargestDivisor(static_cast<int>(requiredShape[1]), widths);
// If no supported height or width found, return the original type.
if (supportedHeight == -1 || supportedWidth == -1)
return tdescType;
SmallVector<int64_t> supportedShape = {supportedHeight, supportedWidth};
xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get(
tdescType.getContext(),
tdescType.getLayoutAttr().getLaneLayout().asArrayRef(), {1, 1});
// Array length cannot be larger than 1 for the transpose case.
return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen,
tdescType.getBoundaryCheck(),
tdescType.getMemorySpace(), newLayout);
}
/// Helper to convert an OpFoldResult to Value.
static Value convertToValue(ConversionPatternRewriter &rewriter, Location loc,
OpFoldResult ofr) {
std::optional<int64_t> mayBeInt = getConstantIntValue(ofr);
if (mayBeInt)
return arith::ConstantIndexOp::create(rewriter, loc, *mayBeInt).getResult();
return llvm::cast<Value>(ofr);
}
/// Helper to divide a Value by a constant integer.
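/// A minimal sketch of the emitted IR, assuming an index-typed %val and a
/// power-of-2 divisor of 8 (the division lowers to a right shift):
///   %c3 = arith.constant 3 : index
///   %r = arith.shrui %val, %c3 : index
/// Non-power-of-2 divisors fall back to arith.divui.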
static Value divideByConstant(ConversionPatternRewriter &rewriter, Location loc,
Value val, int64_t constant) {
// If the constant is a power of 2, use right shift for division.
if (llvm::isPowerOf2_64(constant)) {
int64_t shiftAmount = llvm::Log2_64(constant);
return arith::ShRUIOp::create(
rewriter, loc, val,
arith::ConstantIndexOp::create(rewriter, loc, shiftAmount)
.getResult())
.getResult();
}
auto constantOp =
arith::ConstantIndexOp::create(rewriter, loc, constant).getResult();
return arith::DivUIOp::create(rewriter, loc, val, constantOp).getResult();
}
/// This function takes a larger register block `data` and generates multiple
/// smaller loads (size given by `newTensorDesc`) to fill in the `data` block
/// starting from `offsets`.
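/// Hypothetical illustration (sizes and names are assumptions): for a
/// 32x16xi32 `data` block, a 16x8xi32 `newTensorDesc`, and base offsets
/// (%off0, %off1), the shape ratio is [2, 2], so four loads are emitted at
/// offsets (%off0 + {0, 16}, %off1 + {0, 8}) and each result is inserted into
/// `data` via vector.insert_strided_slice.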
static Value generateLoads(ConversionPatternRewriter &rewriter,
TypedValue<VectorType> data,
SmallVector<OpFoldResult> offsets,
TypedValue<xegpu::TensorDescType> newTensorDesc,
xegpu::LoadNdOp origLoadOp) {
Location loc = data.getLoc();
assert(offsets.size() >= 2 && "Expecting at least 2 offsets for 2D LoadNdOp");
Value offsetDim0 = convertToValue(rewriter, loc, offsets[offsets.size() - 2]);
Value offsetDim1 = convertToValue(rewriter, loc, offsets[offsets.size() - 1]);
SmallVector<int64_t> supportedShape(newTensorDesc.getType().getShape());
// Compute the ratio between the original shape and the supported shape. We
// generate one load per tile in this grid.
auto shapeRatio = computeShapeRatio(data.getType().getShape(),
supportedShape)
.value(); // Shape ratio must be defined if we reach here.
for (int64_t h = 0; h < shapeRatio[0]; ++h) {
for (int64_t w = 0; w < shapeRatio[1]; ++w) {
int64_t localOffsetDim0 = h * supportedShape[0];
int64_t localOffsetDim1 = w * supportedShape[1];
Value loadOffsetX = arith::AddIOp::create(
rewriter, loc, offsetDim0,
arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim0)
.getResult());
Value loadOffsetY = arith::AddIOp::create(
rewriter, loc, offsetDim1,
arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim1)
.getResult());
auto loadOp = xegpu::LoadNdOp::create(
rewriter, loc,
VectorType::get(supportedShape, data.getType().getElementType()),
newTensorDesc, ArrayRef<OpFoldResult>{loadOffsetX, loadOffsetY},
origLoadOp.getPackedAttr(), origLoadOp.getTransposeAttr(),
origLoadOp.getL1HintAttr(), origLoadOp.getL2HintAttr(),
origLoadOp.getL3HintAttr());
// Set the layout for the loadOp.
auto layoutAttr = newTensorDesc.getType().getLayoutAttr();
xegpu::setDistributeLayoutAttr(loadOp->getOpResult(0), layoutAttr);
// Insert the loaded block into the right position in data.
auto insertOp = vector::InsertStridedSliceOp::create(
rewriter, loc, loadOp.getResult(), data,
ArrayRef<int64_t>{localOffsetDim0, localOffsetDim1},
ArrayRef<int64_t>{1, 1});
// InsertOp must have the same layout as newTensorDesc.
xegpu::setDistributeLayoutAttr(insertOp->getOpResult(0), layoutAttr);
data = insertOp.getResult();
}
}
return data;
}
/// Checks if a CreateNdDescOp can be optimized for transpose and, if so,
/// creates a new CreateNdDescOp with the optimized tensor desc type. This
/// involves extracting the base pointer from the original memory source and
/// adjusting the shape and strides of the tensor desc to match the new
/// optimized transpose layout.
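/// A rough before/after sketch (SSA names, sizes, and the layout aliases are
/// illustrative assumptions only):
///   %d = xegpu.create_nd_tdesc %src : memref<64x64xf16>
///        -> !xegpu.tensor_desc<16x16xf16, #transposed_f16_layout>
/// is rewritten so that the f16 buffer is addressed as i32 data, roughly:
///   %p = memref.extract_aligned_pointer_as_index %src
///   %i = arith.index_cast %p : index to i64
///   %d = xegpu.create_nd_tdesc %i, shape : [64, 32], strides : [32, 1]
///        : i64 -> !xegpu.tensor_desc<16x8xi32, #transposed_i32_layout>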
class XeGPUCreateNdDescOpPattern final
: public OpConversionPattern<xegpu::CreateNdDescOp> {
public:
using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
LogicalResult
matchAndRewrite(xegpu::CreateNdDescOp createNdOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
auto tdescTy = createNdOp.getType();
// Get the target uArch info.
auto chipStr = xegpu::getChipStr(createNdOp);
// Check if the chip is supported.
assert(
chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg") &&
"Expecting target chip to be pvc or bmg for transpose optimization.");
const uArch *targetuArch = xegpu::uArch::getUArch(chipStr.value());
auto convertType = tryOptimize(tdescTy, targetuArch);
if (convertType == tdescTy)
return failure();
auto strides = createNdOp.getMixedStrides();
auto maybeConstInnerStride = getConstantIntValue(strides.back());
// Only row-major memrefs are expected for now.
if (!maybeConstInnerStride || *maybeConstInnerStride != 1)
return rewriter.notifyMatchFailure(
createNdOp, "Expecting row-major memref for transpose optimization.");
Value source = createNdOp.getSource();
auto optionalLaneData = getMaybeLaneData(tdescTy);
assert(optionalLaneData && "Expected 2D lane data");
auto laneData = optionalLaneData.value();
int64_t innerLaneData = laneData[1];
auto memrefType = dyn_cast<MemRefType>(source.getType());
// Inner dimension of the shape must be adjusted based on innerLaneData.
SmallVector<OpFoldResult> modifiedShape(createNdOp.getMixedSizes());
modifiedShape.back() = divideByConstant(
rewriter, createNdOp.getLoc(),
convertToValue(rewriter, createNdOp.getLoc(), modifiedShape.back()),
innerLaneData);
// Similarly, second to last stride must be adjusted.
assert(strides.size() >= 2 &&
"Expected at least 2 strides for CreateNdDescOp");
SmallVector<OpFoldResult> modifiedStrides(strides);
modifiedStrides[modifiedStrides.size() - 2] = divideByConstant(
rewriter, createNdOp.getLoc(),
convertToValue(rewriter, createNdOp.getLoc(),
modifiedStrides[modifiedStrides.size() - 2]),
innerLaneData);
// If the source is a static memref, we need to extract a pointer to its
// base address.
if (memrefType && memrefType.hasStaticShape()) {
auto extractOp = memref::ExtractAlignedPointerAsIndexOp::create(
rewriter, createNdOp.getLoc(), source);
source = arith::IndexCastOp::create(rewriter, createNdOp.getLoc(),
rewriter.getI64Type(),
extractOp.getResult())
.getResult();
}
// Create a new CreateNdDescOp with the modified shape and converted type.
auto newCreateNdDescOp = xegpu::CreateNdDescOp::create(
rewriter, createNdOp.getLoc(), convertType, source, modifiedShape,
modifiedStrides);
rewriter.replaceOp(createNdOp, newCreateNdDescOp.getResult());
return success();
}
};
/// Checks if a LoadNdOp consumes a tensor desc type that was rewritten for
/// the transpose optimization. If so, rewrites the LoadNdOp to align with the
/// adjusted tensor desc type. This can result in multiple LoadNdOps being
/// generated to fill in the original load shape.
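/// A hedged sketch of the rewrite, reusing the 16x16xf16 example (shapes and
/// names are assumptions):
///   %v = xegpu.load_nd %d[%off0, %off1]
///        : !xegpu.tensor_desc<16x16xf16, ...> -> vector<16x16xf16>
/// becomes, with the column offset halved for the i32 view:
///   %l = xegpu.load_nd %d2[%off0, %off1 / 2]
///        : !xegpu.tensor_desc<16x8xi32, ...> -> vector<16x8xi32>
///   %a = vector.insert_strided_slice %l, %zero ...
///   %v = vector.bitcast %a : vector<16x8xi32> to vector<16x16xf16>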
class XeGPULoadNdDescOpPattern final
: public OpConversionPattern<xegpu::LoadNdOp> {
public:
using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
LogicalResult
matchAndRewrite(xegpu::LoadNdOp loadNdOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
auto origTensorDescType = loadNdOp.getTensorDescType();
auto adaptorType =
cast<xegpu::TensorDescType>(adaptor.getTensorDesc().getType());
if (adaptorType == origTensorDescType)
return failure();
// Offsets must be adjusted based on innerLaneData.
auto laneData = getMaybeLaneData(loadNdOp.getTensorDescType()).value();
int64_t innerLaneData = laneData[1];
auto offsets = loadNdOp.getMixedOffsets();
if (offsets.empty())
return rewriter.notifyMatchFailure(loadNdOp,
"Expecting offsets in LoadNd");
SmallVector<OpFoldResult> modifiedOffsets(offsets);
modifiedOffsets.back() = divideByConstant(
rewriter, loadNdOp.getLoc(),
convertToValue(rewriter, loadNdOp.getLoc(), modifiedOffsets.back()),
innerLaneData);
// Get the 2D data shape of this loadNdOp in its original element type. The
// array length dimension is not part of this shape; it is handled separately
// below.
SmallVector<int64_t> origDataShape(origTensorDescType.getShape());
// Adjust the data shape based on innerLaneData.
origDataShape.back() /= innerLaneData;
// HW supported shape is the new tensor desc shape after conversion.
SmallVector<int64_t> hwSupportedShape(adaptorType.getShape());
VectorType origVectorType =
VectorType::get(origDataShape, adaptorType.getElementType());
Value data;
// When the array length is greater than 1, the original load result is 3D
// (array_length x rows x cols); each array slice is loaded separately.
if (origTensorDescType.getArrayLength() > 1) {
SmallVector<Value> arraySlices;
// Capture the base Y offset once; slice i starts i * origDataShape[1]
// columns after it.
Value baseOffsetY = convertToValue(rewriter, loadNdOp->getLoc(),
modifiedOffsets.back());
for (int64_t i = 0; i < origTensorDescType.getArrayLength(); ++i) {
Value slice = arith::ConstantOp::create(
rewriter, loadNdOp->getLoc(), origVectorType,
rewriter.getZeroAttr(origVectorType));
// Advance the Y offset for this array slice.
modifiedOffsets.back() =
arith::AddIOp::create(
rewriter, loadNdOp->getLoc(), baseOffsetY,
arith::ConstantIndexOp::create(rewriter, loadNdOp->getLoc(),
i * origDataShape[1])
.getResult())
.getResult();
slice = generateLoads(
rewriter, cast<TypedValue<VectorType>>(slice), modifiedOffsets,
cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()),
loadNdOp);
// BitCast back to original load shape without array length.
auto bitcastType = VectorType::get(origTensorDescType.getShape(),
origTensorDescType.getElementType());
auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
bitcastType, slice);
// BitCastOp must have the same layout as the original loadNdOp.
xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0),
origTensorDescType.getLayoutAttr());
arraySlices.push_back(bitCastOp.getResult());
}
rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices});
return success();
}
data = arith::ConstantOp::create(rewriter, loadNdOp->getLoc(), origVectorType,
rewriter.getZeroAttr(origVectorType));
data = generateLoads(
rewriter, cast<TypedValue<VectorType>>(data), modifiedOffsets,
cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()),
loadNdOp);
auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
loadNdOp.getType(), data);
// BitCastOp must have the same layout as the original loadNdOp.
xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0),
origTensorDescType.getLayoutAttr());
rewriter.replaceOp(loadNdOp, bitCastOp);
return success();
}
};
/// A vector.extract must be processed if the original tensor desc type has an
/// array length greater than 1. In that case, the LoadNdOp has been replaced
/// with one value per array slice, making the extraction unnecessary, so the
/// ExtractOp is folded away by forwarding the corresponding slice.
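/// A hedged sketch (SSA names are assumptions): if the producing load has
/// been split into %slice0 and %slice1, then
///   %e = vector.extract %v[1] : vector<16x16xf16> from vector<2x16x16xf16>
/// is simply replaced by %slice1.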
class VectorExtractOpPattern final
: public OpConversionPattern<vector::ExtractOp> {
public:
using OpConversionPattern<vector::ExtractOp>::OpConversionPattern;
LogicalResult
matchAndRewrite(vector::ExtractOp extractOp, OneToNOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
// Check if the source of the extraction is split to multiple values.
if (adaptor.getSource().size() == 1)
return failure();
auto mixedPos = extractOp.getMixedPosition();
if (mixedPos.size() != 1)
return failure();
auto mayBeInt = getConstantIntValue(mixedPos[0]);
if (!mayBeInt)
return failure();
rewriter.replaceOp(extractOp, adaptor.getSource()[*mayBeInt]);
return success();
}
};
} // namespace
void xegpu::populateXeGPUOptimizeBlockLoadsPatterns(
RewritePatternSet &patterns) {
patterns.add<XeGPUCreateNdDescOpPattern, XeGPULoadNdDescOpPattern,
VectorExtractOpPattern>(patterns.getContext());
}
namespace {
struct XeGPUOptimizeBlockLoadsPass final
: public xegpu::impl::XeGPUOptimizeBlockLoadsBase<
XeGPUOptimizeBlockLoadsPass> {
void runOnOperation() override {
MLIRContext &context = getContext();
TypeConverter converter;
RewritePatternSet patterns(&context);
ConversionTarget target(context);
// This pass is only meant for PVC and BMG targets. If an unsupported target
// is found, exit early.
bool isTargetSupported = false;
getOperation()->walk([&](gpu::GPUFuncOp funcOp) {
auto chipStr = xegpu::getChipStr(funcOp);
if (chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg"))
isTargetSupported = true;
});
if (!isTargetSupported) {
DBGS() << "XeGPUOptimizeBlockLoadsPass only supports PVC and BMG targets."
<< "\n";
return;
}
// CreateNdDescOp and LoadNdOp with optimizable tensor desc types must be
// converted.
target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
[&](xegpu::CreateNdDescOp createNdOp) {
return !canBeOptimizedForTranspose(createNdOp.getType());
});
target.addDynamicallyLegalOp<xegpu::LoadNdOp>(
[&](xegpu::LoadNdOp loadNdOp) {
return !canBeOptimizedForTranspose(loadNdOp.getTensorDescType());
});
// Vector ExtractOps can have optimizable layouts if they extract from
// LoadNdOps with array length greater than 1. These ExtractOps must be
// converted.
target.addDynamicallyLegalOp<vector::ExtractOp>(
[&](vector::ExtractOp extractOp) {
auto layout = xegpu::getDistributeLayoutAttr(extractOp.getResult());
if (!layout)
return true;
auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
auto laneData = layout.getEffectiveLaneDataAsInt();
return !canBeOptimizedForTranspose(laneLayout, laneData);
});
converter.addConversion([](Type type) { return type; });
target.addLegalDialect<arith::ArithDialect, memref::MemRefDialect,
vector::VectorDialect>();
scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
target);
xegpu::populateXeGPUOptimizeBlockLoadsPatterns(patterns);
if (failed(applyPartialConversion(getOperation(), target,
std::move(patterns)))) {
DBGS() << "Optimize block loads pass failed.\n";
return signalPassFailure();
}
}
};
} // namespace