mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp - llvm-project - Git at Google

 //===---- XeGPUUtils.cpp - MLIR Utilities for XeGPUOps   ------------------===//
 //
 // Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file implements utility methods for working with the XeGPU dialect.
 //
 //===----------------------------------------------------------------------===//

 #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/XeVMDialect.h"
 #include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/IR/ValueRange.h"
 #include "mlir/Interfaces/LoopLikeInterface.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/FormatVariadic.h"
 #include <cstdint>
 #include <numeric>

 using namespace mlir;

 /// Concatenates a list of value ranges into one flat SmallVector<Value>,
 /// keeping the order of the ranges and of the values inside each range.
 SmallVector<Value> xegpu::flattenValues(ArrayRef<ValueRange> values) {
   SmallVector<Value> flattened;
   for (ValueRange range : values)
     flattened.append(range.begin(), range.end());
   return flattened;
 }

 /// Returns the 1D vector type obtained when a value described by `tdescTy` is
 /// distributed over the lanes of a subgroup (SIMD code turned into SIMT code).
 ///
 /// Fails if the descriptor has no LayoutAttr or if the layout is not a
 /// subgroup-level layout. The resulting vector has
 /// (product(shape) * array_length) / product(lane_layout) elements.
 FailureOr<VectorType>
 mlir::xegpu::getDistributedVectorType(xegpu::TensorDescType tdescTy) {
   auto layout = llvm::dyn_cast_if_present<LayoutAttr>(tdescTy.getLayout());
   // Only a subgroup-level layout (lane_layout + lane_data) can be used here.
   if (!(layout && layout.isForSubgroup()))
     return failure();

   SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
   SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
   auto descShape = tdescTy.getShape();

   // The subgroup size is the product of all lane_layout entries, e.g.
   // laneLayout[0] * laneLayout[1] for a 2D layout, laneLayout[0] for 1D.
   const int64_t subgroupSize = llvm::product_of(laneLayout);

   // Accumulate the element count of one block while checking (in assert
   // builds) that every dimension divides evenly across the lanes.
   int64_t numElements = 1;
   for (auto [dimSize, lanes, elemsPerLane] :
        llvm::zip_equal(descShape, laneLayout, laneData)) {
     assert((dimSize % (lanes * elemsPerLane) == 0) &&
            "tensor descriptor shape is not distributable");
     numElements *= dimSize;
   }
   // Account for array_length: the descriptor covers that many blocks.
   numElements *= tdescTy.getArrayLength();

   return VectorType::get({numElements / subgroupSize},
                          tdescTy.getElementType());
 }

 /// Computes the distributed vector type for `originalType` under `layout` by
 /// building a helper tensor descriptor type and delegating to the
 /// TensorDescType overload.
 ///
 /// Only rank 1, 2 and 3 vectors are accepted; any other rank fails. For a 3D
 /// vector the leading dimension is used as the descriptor's array length and
 /// the remaining two dimensions as its shape.
 FailureOr<VectorType>
 mlir::xegpu::getDistributedVectorType(VectorType originalType,
                                       xegpu::LayoutAttr layout) {
   const int64_t rank = originalType.getRank();
   if (!(rank >= 1 && rank <= 3))
     return failure();

   ArrayRef<int64_t> blockShape = originalType.getShape();
   const bool hasArrayDim = (rank == 3);
   // Array length is 1 unless the vector is 3D, in which case it is dim 0.
   int arrayLength = hasArrayDim ? blockShape.front() : 1;
   if (hasArrayDim)
     blockShape = blockShape.drop_front();

   auto helperTdescTy = xegpu::TensorDescType::get(
       blockShape, originalType.getElementType(), arrayLength,
       /*boundary_check=*/true,
       /*memory_space=*/xegpu::MemorySpace::Global, layout);
   return xegpu::getDistributedVectorType(helperTdescTy);
 }

 /// Computes the distributed vector type of `originalType` using the
 /// distributed shape reported by `layout`.
 ///
 /// Fails when `layout` is null or when the layout cannot compute a distributed
 /// shape. If the vector has more dimensions than the layout, only the trailing
 /// `layout.getRank()` dimensions are distributed; the leading dimensions are
 /// copied into the result unchanged.
 FailureOr<VectorType>
 xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
                                        VectorType originalType) {
   if (!layout)
     return failure();
   assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
          "Expecting a valid layout.");

   const int64_t vectorRank = originalType.getRank();
   const int64_t layoutRank = layout.getRank();
   assert(vectorRank >= layoutRank && "Vector rank must be >= layout rank.");

   // Split the shape into the untouched leading part and the trailing part
   // that the layout distributes.
   const int64_t numLeadingDims = vectorRank - layoutRank;
   ArrayRef<int64_t> wholeShape = originalType.getShape();
   SmallVector<int64_t> distributableDims(wholeShape.drop_front(numLeadingDims));

   auto distributedDims = layout.computeDistributedShape(distributableDims);
   if (failed(distributedDims))
     return failure();

   // Result shape = leading dims as-is, followed by the distributed dims.
   SmallVector<int64_t> newShape(wholeShape.take_front(numLeadingDims));
   llvm::append_range(newShape, *distributedDims);
   return VectorType::get(newShape, originalType.getElementType());
 }

 std::string xegpu::getTemporaryLayoutName(const OpOperand &operand) {
   const StringRef prefix("layout_operand_");
   unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
   return llvm::formatv("{0}{1}", prefix, idx).str();
 }

 std::string xegpu::getTemporaryLayoutName(const OpResult result) {
   const StringRef prefix = "layout_result_";
   return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str();
 }

 /// Looks up the layout associated with `value`. Sources are tried in order:
 ///   1. the layout of the value's TensorDescType, if it has that type;
 ///   2. for an op result: the anchor layout if the defining op implements
 ///      AnchorLayoutInterface, otherwise the temporary layout attribute
 ///      attached to the defining op for this result;
 ///   3. for a block argument of a loop-like op: the layout of the tied loop
 ///      init value (resolved recursively).
 /// Returns a null attribute when `value` is null or nothing is found.
 xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
   if (!value)
     return nullptr;

   // Tensor descriptors carry their layout in the type itself.
   if (auto tdescTy =
           dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
     return tdescTy.getLayoutAttr();

   if (auto result = dyn_cast<OpResult>(value)) {
     Operation *producer = result.getDefiningOp();
     assert(producer && "result must have a defining op");

     // Anchor ops are queried through their interface.
     if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(producer))
       return anchorOp.getAnchorLayout();

     // Otherwise look for the temporary per-result attribute on the producer.
     const std::string attrName = getTemporaryLayoutName(result);
     if (producer->hasAttr(attrName))
       return producer->getAttrOfType<xegpu::DistributeLayoutAttr>(attrName);
   }

   if (auto blockArg = dyn_cast<BlockArgument>(value)) {
     auto *enclosingOp = blockArg.getOwner()->getParentOp();
     if (auto loop = dyn_cast_if_present<LoopLikeOpInterface>(enclosingOp)) {
       // Follow the loop-carried argument back to its init operand.
       if (OpOperand *tiedInit = loop.getTiedLoopInit(blockArg))
         return getDistributeLayoutAttr(tiedInit->get());
     }
   }

   return nullptr;
 }
 /// Looks up the layout associated with operand `opr` of its owner op.
 ///
 /// For ops implementing AnchorLayoutInterface:
 ///   - DpasOp: operands 0/1/2 map to the A/B/CD layout attributes;
 ///   - ConvertLayoutOp: the input layout is returned;
 ///   - operand 0 of any other anchor op: the anchor layout;
 ///   - StoreNdOp/StoreMatrixOp: the anchor layout also applies to operand 1;
 ///   - StoreScatterOp (operands >= 2) and LoadGatherOp (operands >= 1): when
 ///     the chunk size is greater than 1 the anchor layout with its last
 ///     dimension dropped is returned, otherwise the anchor layout itself.
 /// If none of the above returns, the temporary per-operand attribute on the
 /// owner is used. Returns a null attribute when nothing is found.
 xegpu::DistributeLayoutAttr
 xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
   Operation *owner = opr.getOwner();
   const unsigned operandIdx = const_cast<OpOperand &>(opr).getOperandNumber();

   // Returns `attr` with its innermost (last) dimension removed.
   auto dropInnermostDim = [](auto attr) {
     return attr.dropDims(llvm::to_vector(
         llvm::seq<int64_t>(attr.getRank() - 1, attr.getRank())));
   };

   if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
     if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
       switch (operandIdx) {
       case 0:
         return dpasOp.getLayoutAAttr();
       case 1:
         return dpasOp.getLayoutBAttr();
       case 2:
         return dpasOp.getLayoutCdAttr();
       default:
         break;
       }
     }
     if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(owner))
       return convertOp.getInputLayoutAttr();

     auto anchorLayout = anchorOp.getAnchorLayout();

     if (operandIdx == 0)
       return anchorLayout;

     // For StoreNdOp and StoreMatrixOp the layout is valid for the first two
     // operands: value and memref/tdesc.
     if (operandIdx < 2 && isa<xegpu::StoreNdOp, xegpu::StoreMatrixOp>(owner))
       return anchorLayout;

     if (auto store = dyn_cast<xegpu::StoreScatterOp>(owner)) {
       int chunkSize = store.getChunkSize().value_or(1);
       const bool dropChunkDim =
           anchorLayout && operandIdx >= 2 && chunkSize > 1;
       if (dropChunkDim)
         return dropInnermostDim(anchorLayout);
       return anchorLayout;
     }
     if (auto load = dyn_cast<xegpu::LoadGatherOp>(owner)) {
       int chunkSize = load.getChunkSize().value_or(1);
       const bool dropChunkDim =
           anchorLayout && operandIdx >= 1 && chunkSize > 1;
       if (dropChunkDim)
         return dropInnermostDim(anchorLayout);
       return anchorLayout;
     }
   }

   // Fall back to the temporary per-operand attribute on the owner.
   const std::string attrName = xegpu::getTemporaryLayoutName(opr);
   if (owner->hasAttr(attrName))
     return owner->getAttrOfType<xegpu::DistributeLayoutAttr>(attrName);

   return nullptr;
 }

 // Returns the layout attribute stored on `owner` when `owner` is a
 // LoadGatherOp that has a non-null layout attribute; otherwise returns the
 // provided `layout`. `result` and `name` are not consulted.
 xegpu::DistributeLayoutAttr
 maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
                          const OpResult &result, mlir::Operation *owner,
                          const std::string &name) {
   if (auto loadOp = dyn_cast<xegpu::LoadGatherOp>(owner))
     if (auto permanent = loadOp.getLayoutAttr())
       return permanent;

   return layout;
 }

 // Returns the layout attribute stored on `owner` when `owner` is a
 // StoreScatterOp with a non-null layout attribute and `operand` is its
 // operand 0; otherwise returns the provided `layout`. `name` is not
 // consulted.
 xegpu::DistributeLayoutAttr
 maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
                          const OpOperand &operand, mlir::Operation *owner,
                          const std::string &name) {
   const unsigned operandIdx =
       const_cast<OpOperand &>(operand).getOperandNumber();

   auto storeOp = dyn_cast<xegpu::StoreScatterOp>(owner);
   if (storeOp && operandIdx == 0)
     if (auto permanent = storeOp.getLayoutAttr())
       return permanent;

   return layout;
 }

 // TODO-LayoutRefactor: Remove this function after replacing use
 //  with setTemporaryLayout or setAnchorLayout
 //
 // Records `layout` for `result`. If the owner implements
 // AnchorLayoutInterface, its anchor layout is set (unless it already equals
 // `layout`). Otherwise a non-null `layout` is stored as the temporary
 // per-result attribute, but only if the owner does not already carry one.
 void xegpu::setDistributeLayoutAttr(
     const mlir::OpResult &result,
     const mlir::xegpu::DistributeLayoutAttr layout) {
   Operation *owner = result.getOwner();

   if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
     const bool upToDate = (anchorOp.getAnchorLayout() == layout);
     if (!upToDate)
       anchorOp.setAnchorLayout(layout);
     return;
   }

   // Never overwrite an existing temporary layout, never store a null one.
   const std::string attrName = xegpu::getTemporaryLayoutName(result);
   const bool alreadyPresent =
       owner->hasAttrOfType<DistributeLayoutAttr>(attrName);
   if (!alreadyPresent && layout)
     owner->setAttr(attrName, layout);
 }

 // TODO-LayoutRefactor: Remove this function after replacing use
 //  with setTemporaryLayout or setAnchorLayout
 //
 // Records a non-null `layout` for `operand`; a null `layout` is a no-op.
 //   - DpasOp operands 0/1/2 set the A/B/CD layout attribute and return.
 //   - ConvertLayoutOp sets the input layout and returns.
 //   - Other anchor ops set the anchor layout for operand 0, and additionally
 //     for operand 1 of StoreScatterOp/StoreNdOp/StoreMatrixOp.
 // Every path that does not return above (including the anchor-op path just
 // described) then stores `layout` as the temporary per-operand attribute
 // unless the owner already carries one.
 void xegpu::setDistributeLayoutAttr(const OpOperand &operand,
                                     const DistributeLayoutAttr layout) {
   if (!layout)
     return;

   Operation *owner = operand.getOwner();
   const unsigned operandIdx =
       const_cast<OpOperand &>(operand).getOperandNumber();

   if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
     if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
       switch (operandIdx) {
       case 0:
         dpasOp.setLayoutAAttr(layout);
         return;
       case 1:
         dpasOp.setLayoutBAttr(layout);
         return;
       case 2:
         dpasOp.setLayoutCdAttr(layout);
         return;
       default:
         break;
       }
     }
     if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(owner)) {
       convertOp.setInputLayoutAttr(layout);
       return;
     }

     // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp) the
     // layout is valid for the first two operands: value and memref/tdesc.
     // For other operations it applies to the first operand only.
     const bool isStoreOp =
         isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
             owner);
     const unsigned numAnchoredOperands = isStoreOp ? 2u : 1u;
     if (operandIdx < numAnchoredOperands)
       anchorOp.setAnchorLayout(layout);
   }

   // `layout` is known to be non-null here; keep any existing attribute.
   const std::string attrName = xegpu::getTemporaryLayoutName(operand);
   if (!owner->hasAttrOfType<DistributeLayoutAttr>(attrName))
     owner->setAttr(attrName, layout);
 }

 /// Returns the temporary layout attribute attached to the owner of
 /// `operandOrResult` under the name given by getTemporaryLayoutName(), or a
 /// null attribute if the owner has no attribute of that name.
 template <typename T, typename>
 xegpu::DistributeLayoutAttr
 xegpu::getTemporaryLayout(const T &operandOrResult) {
   Operation *owner = operandOrResult.getOwner();
   const std::string attrName = xegpu::getTemporaryLayoutName(operandOrResult);

   if (!owner->hasAttr(attrName))
     return nullptr;
   return owner->getAttrOfType<xegpu::DistributeLayoutAttr>(attrName);
 }

 // Explicit instantiations for the two supported kinds of IR handles.
 template xegpu::DistributeLayoutAttr
 xegpu::getTemporaryLayout<mlir::OpResult>(const OpResult &result);
 template xegpu::DistributeLayoutAttr
 xegpu::getTemporaryLayout<mlir::OpOperand>(const OpOperand &operand);

 /// Attaches `layout` to the owner of `operandOrResult` as a temporary layout
 /// attribute named by getTemporaryLayoutName(). Does nothing if the owner
 /// already has a DistributeLayoutAttr under that name or if `layout` is null.
 template <typename T, typename>
 void xegpu::setTemporaryLayout(const T &operandOrResult,
                                const xegpu::DistributeLayoutAttr layout) {
   Operation *owner = operandOrResult.getOwner();
   const std::string attrName = xegpu::getTemporaryLayoutName(operandOrResult);

   const bool alreadyPresent =
       owner->hasAttrOfType<xegpu::DistributeLayoutAttr>(attrName);
   if (!alreadyPresent && layout)
     owner->setAttr(attrName, layout);
 }

 // Explicit instantiations for the two supported kinds of IR handles.
 template void xegpu::setTemporaryLayout<mlir::OpResult>(
     const mlir::OpResult &result,
     const mlir::xegpu::DistributeLayoutAttr layout);

 template void xegpu::setTemporaryLayout<mlir::OpOperand>(
     const mlir::OpOperand &operand,
     const mlir::xegpu::DistributeLayoutAttr layout);

 /// Splits the vector `value` into tiles of shape `shape`, one
 /// vector.extract_strided_slice per tile, and returns the tiles in the order
 /// produced by StaticTileOffsetRange.
 ///
 /// `value` is returned unchanged (as a single-element list) when it is not a
 /// vector or when computeShapeRatio() reports no ratio between its shape and
 /// `shape`. If `shape` has lower rank than the source, each slice is
 /// shape-cast to `shape` to remove the leading unit dimensions.
 SmallVector<Value>
 xegpu::extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc,
                                         Value value, ArrayRef<int64_t> shape) {
   auto vecTy = dyn_cast<VectorType>(value.getType());
   if (!vecTy)
     return {value};

   ArrayRef<int64_t> srcShape = vecTy.getShape();
   if (!computeShapeRatio(srcShape, shape))
     return {value};

   const int64_t srcRank = srcShape.size();
   const int64_t tileRank = shape.size();
   const int64_t numLeadingUnitDims = srcRank - tileRank;

   // Pad the tile shape with leading 1s so that it has the source rank.
   SmallVector<int64_t> paddedTileShape(numLeadingUnitDims, 1);
   llvm::append_range(paddedTileShape, shape);

   // All slices use unit strides in every dimension.
   const SmallVector<int64_t> unitStrides(srcRank, 1);
   const bool needsShapeCast = srcRank > tileRank;

   SmallVector<Value> tiles;
   for (SmallVector<int64_t> tileOffsets :
        StaticTileOffsetRange(srcShape, paddedTileShape)) {
     Value tile = vector::ExtractStridedSliceOp::create(
         builder, loc, value, tileOffsets, paddedTileShape, unitStrides);

     // Reshape to remove leading unit dims if needed.
     if (needsShapeCast) {
       auto tileTy = VectorType::get(shape, vecTy.getElementType());
       tile = vector::ShapeCastOp::create(builder, loc, tileTy, tile);
     }
     tiles.push_back(tile);
   }

   return tiles;
 }

 /// Assembles a vector of shape `shape` from equally-typed vector tiles.
 ///
 /// A zero-filled constant of the result type is created first; each value in
 /// `values` is then inserted with vector.insert_strided_slice (unit strides)
 /// at the offsets produced by StaticTileOffsetRange(shape, tileShape), in
 /// order. Returns the value of the last insertion.
 ///
 /// Preconditions (checked by assertions): `values` is non-empty and all
 /// values share one VectorType, which defines the tile shape and element
 /// type.
 Value xegpu::createVectorWithShapeFromValues(OpBuilder &builder, Location loc,
                                              ValueRange values,
                                              ArrayRef<int64_t> shape) {
   assert(!values.empty() && "expected at least one value");
   // The tile type is dereferenced unconditionally below, so use cast<> (which
   // asserts on a type mismatch) rather than an unchecked dyn_cast<>.
   auto inputTy = cast<VectorType>(values[0].getType());
   assert(llvm::all_of(values.getTypes(),
                       [&](Type type) { return type == inputTy; }) &&
          "values must be of the same VectorType");

   Type elemTy = inputTy.getElementType();
   ArrayRef<int64_t> tileShape = inputTy.getShape();

   // Start from an all-zero vector of the requested shape.
   VectorType resultTy = VectorType::get(shape, elemTy);
   auto zeroAttr = builder.getZeroAttr(elemTy);
   Value result = arith::ConstantOp::create(
       builder, loc, resultTy, DenseElementsAttr::get(resultTy, zeroAttr));

   // Every insertion uses unit strides in each tile dimension.
   const SmallVector<int64_t> staticStrides(tileShape.size(), 1);
   for (auto [src, offsets] :
        llvm::zip_equal(values, StaticTileOffsetRange(shape, tileShape))) {
     result = vector::InsertStridedSliceOp::create(builder, loc, src, result,
                                                   offsets, staticStrides);
   }
   return result;
 }

 /// Runs an SCF structural type conversion on `op` in three phases:
 ///   1. VectorType values are converted to RankedTensorType across SCF
 ///      structural ops, with UnrealizedConversionCastOps as materializations.
 ///   2. The layout found on each vector -> tensor cast input is attached as
 ///      the encoding of the tensor type, and the new type is propagated to
 ///      loop region arguments and to the results of ops enclosing scf.yield.
 ///   3. The caller-provided `converter` is applied to the SCF structural ops,
 ///      and the casts introduced in phase 1 are rewritten by a local pattern.
 /// The results of both applyPartialConversion calls are deliberately ignored.
 void xegpu::doSCFStructuralTypeConversionWithTensorType(
     Operation *op, TypeConverter converter) {
   MLIRContext *context = op->getContext();

   // Materialization callback: bridges type mismatches with a single-result
   // UnrealizedConversionCastOp.
   auto materializeCast = [](OpBuilder &builder, Type type, ValueRange inputs,
                             Location loc) -> Value {
     return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
         .getResult(0);
   };

   { // convert VectorType to RankedTensorType for SCF Structural ops
     // NOTE: this local `converter` shadows the function parameter inside this
     // scope; the parameter is only used in the last phase.
     TypeConverter converter;
     // Identity conversion for every type, plus VectorType -> RankedTensorType
     // with the same shape and element type.
     converter.addConversion([](Type type) -> Type { return type; });
     converter.addConversion([](VectorType type) -> Type {
       return RankedTensorType::get(type.getShape(), type.getElementType());
     });
     converter.addSourceMaterialization(materializeCast);
     converter.addTargetMaterialization(materializeCast);

     mlir::ConversionTarget target(*context);
     target.addLegalOp<UnrealizedConversionCastOp>();

     mlir::RewritePatternSet patterns(context);
     scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
                                                          target);
     (void)mlir::applyPartialConversion(op, target, std::move(patterns));
   }

   { // propagate the layout attribute to RankedTensorType by checking
     // BuiltInUnrealizedCastOps
     // for VectorType to RankedTensorType cast.
     op->walk([](UnrealizedConversionCastOp castOp) {
       // Only 1:1 casts are considered.
       if (castOp.getNumOperands() != 1 || castOp.getNumResults() != 1)
         return WalkResult::skip();

       Value input = castOp.getInputs()[0];
       Value result = castOp.getResults()[0];
       auto inputTy = dyn_cast<VectorType>(input.getType());
       auto resultTy = dyn_cast<RankedTensorType>(result.getType());

       // Only look at ops casting from VectorType to RankedTensorType
       if (!inputTy || !resultTy)
         return WalkResult::skip();

       // Without a layout on the vector input there is nothing to propagate.
       xegpu::DistributeLayoutAttr layout =
           xegpu::getDistributeLayoutAttr(input);
       if (!layout)
         return WalkResult::skip();

       // Attach the layout as the tensor encoding, mutating the type in place.
       RankedTensorType newTy = resultTy.cloneWithEncoding(layout);
       result.setType(newTy);

       // update the arguments if user is a LoopLike op.
       for (OpOperand &use : result.getUses()) {
         if (auto loop = dyn_cast<LoopLikeOpInterface>(use.getOwner())) {
           BlockArgument arg = loop.getTiedLoopRegionIterArg(&use);
           arg.setType(newTy);
         }
         // whileOp has two regions, the BlockArgument of the after region
         // is not exposed by LoopLikeOpInterface
         if (auto whileOp = dyn_cast<scf::WhileOp>(use.getOwner())) {
           unsigned idx = use.getOperandNumber();
           BlockArgument arg = whileOp.getAfterArguments()[idx];
           arg.setType(newTy);
         }
       }
       return WalkResult::advance();
     });

     // using yieldOp as anchor to update the result type of its ParentOp
     op->walk([](scf::YieldOp yieldOp) {
       Operation *parentOp = yieldOp->getParentOp();
       for (OpResult r : parentOp->getOpResults()) {
         unsigned idx = r.getResultNumber();
         Type resultTy = r.getType();
         Type yieldTy = yieldOp.getResults()[idx].getType();
         // Only tensor-typed results that differ from the yielded type are
         // updated.
         if (isa<RankedTensorType>(resultTy) && yieldTy != resultTy)
           r.setType(yieldTy);
       }
     });
   }

   { // perform the conversion from RankedTensorType to VectorType based on the
     // DistributeLayoutAttr

     // Handle the UnrealizedConversionCastOp introduced by the first step.
     // For vector->RankedTensorType, it will simply forward the inputs.
     // For RankedTensorType->vector, it will update the inputs with the
     // one from the adaptor.
     class UnrealizedConversionCastOpPattern
         : public OpConversionPattern<mlir::UnrealizedConversionCastOp> {
       using OpConversionPattern<
           mlir::UnrealizedConversionCastOp>::OpConversionPattern;

       mlir::LogicalResult
       matchAndRewrite(mlir::UnrealizedConversionCastOp op,
                       OneToNOpAdaptor adaptor,
                       ConversionPatternRewriter &rewriter) const override {
         auto inputs = op.getOperands();
         auto outputs = op.getOutputs();

         // Only 1:1 casts are handled by this pattern.
         if (inputs.size() != 1 || outputs.size() != 1)
           return failure();

         auto inputTy = inputs[0].getType();
         auto outputTy = outputs[0].getType();

         // vector -> tensor: replace the cast with the adaptor's values.
         if (isa<VectorType>(inputTy) && isa<RankedTensorType>(outputTy)) {
           rewriter.replaceOpWithMultiple(op, adaptor.getInputs());
           return success();
         }

         // tensor -> vector: rebuild the cast on the flattened adaptor values.
         if (isa<RankedTensorType>(inputTy) && isa<VectorType>(outputTy)) {
           SmallVector<Value> values = xegpu::flattenValues(adaptor.getInputs());
           auto newOp = UnrealizedConversionCastOp::create(rewriter, op.getLoc(),
                                                           outputTy, values);
           rewriter.replaceOp(op, newOp);
           return success();
         }
         return failure();
       }
     };

     // From here on `converter` is the function parameter supplied by the
     // caller. The target materialization may produce several results.
     converter.addSourceMaterialization(materializeCast);
     converter.addTargetMaterialization([&](OpBuilder &builder, TypeRange type,
                                            ValueRange inputs, Location loc) {
       return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
           .getResults();
     });

     mlir::ConversionTarget target(*context);
     // A cast is legal only if none of its operand or result types is a
     // RankedTensorType.
     target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
         [](UnrealizedConversionCastOp op) {
           auto isTensorTy = [](Type type) {
             return isa<RankedTensorType>(type);
           };
           return llvm::none_of(op->getOperandTypes(), isTensorTy) &&
                  llvm::none_of(op->getResultTypes(), isTensorTy);
         });
     mlir::RewritePatternSet patterns(context);
     patterns.insert<UnrealizedConversionCastOpPattern>(context);
     scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
                                                          target);
     (void)mlir::applyPartialConversion(op, target, std::move(patterns));
   }
 }

 /// Returns the chip string of the first XeVM target attribute found on the
 /// gpu.module enclosing `op`. Returns std::nullopt if `op` is not nested in a
 /// gpu.module, if the module has no targets, or if none of the targets is an
 /// XeVMTargetAttr.
 std::optional<std::string> xegpu::getChipStr(Operation *op) {
   auto gpuModuleOp = op->getParentOfType<gpu::GPUModuleOp>();
   if (!gpuModuleOp)
     return std::nullopt;

   auto targetAttrs = gpuModuleOp.getTargets();
   if (!targetAttrs)
     return std::nullopt;

   for (const auto &target : *targetAttrs) {
     if (auto xevmAttr = llvm::dyn_cast<xevm::XeVMTargetAttr>(target))
       return xevmAttr.getChip().str();
   }
   return std::nullopt;
 }

 /// Adds two equally-sized arrays of OpFoldResult element by element.
 /// Each operand is first materialized as an index value (constants become
 /// constant index ops); the sum is built with createOrFold<arith::AddIOp>.
 /// Asserts that `lhs` and `rhs` have the same length.
 SmallVector<OpFoldResult> xegpu::addElementwise(OpBuilder &builder,
                                                 Location loc,
                                                 ArrayRef<OpFoldResult> lhs,
                                                 ArrayRef<OpFoldResult> rhs) {
   assert(lhs.size() == rhs.size() && "lhs and rhs must have the same size");

   SmallVector<OpFoldResult> sums;
   for (auto [lhsElem, rhsElem] : llvm::zip_equal(lhs, rhs)) {
     // Materialize lhs before rhs so the op creation order is deterministic.
     Value lhsVal = getValueOrCreateConstantIndexOp(builder, loc, lhsElem);
     Value rhsVal = getValueOrCreateConstantIndexOp(builder, loc, rhsElem);
     sums.push_back(builder.createOrFold<arith::AddIOp>(loc, lhsVal, rhsVal));
   }
   return sums;
 }

 /// Adds two arrays of possibly different lengths, aligning them at the right.
 /// The shorter array is matched against the trailing elements of the longer
 /// one; the unmatched leading elements of the longer array are copied to the
 /// result unchanged. This is commonly used for offset computation where
 /// lower-dimensional adjustments are added to higher-dimensional offsets.
 ///
 /// Example:
 ///   lhs = [l1, l2, l3], rhs = [r1, r2]
 ///   Result: [l1, l2+r1, l3+r2]
 SmallVector<OpFoldResult>
 xegpu::addWithRightAligned(OpBuilder &builder, Location loc,
                            ArrayRef<OpFoldResult> lhs,
                            ArrayRef<OpFoldResult> rhs) {
   // Pick the operands so that `longer` has at least as many elements as
   // `shorter`; on a tie `lhs` is treated as the longer one.
   const bool lhsIsLonger = lhs.size() >= rhs.size();
   ArrayRef<OpFoldResult> longer = lhsIsLonger ? lhs : rhs;
   ArrayRef<OpFoldResult> shorter = lhsIsLonger ? rhs : lhs;
   const size_t numUnmatched = longer.size() - shorter.size();

   // Leading elements pass through; the aligned tails are added pairwise.
   SmallVector<OpFoldResult> results(longer.take_front(numUnmatched));
   results.append(
       addElementwise(builder, loc, longer.drop_front(numUnmatched), shorter));
   return results;
 }

 /// Returns the largest non-zero product `candidate * multiple` (taken over
 /// all `candidates` and all `candidateMultiples`) that divides `dim` evenly,
 /// or -1 if there is none. When `candidateMultiples` is empty, each candidate
 /// is tried on its own (i.e. with a multiple of 1).
 template <typename T>
 int xegpu::getLargestDivisor(T dim, ArrayRef<T> candidates,
                              ArrayRef<T> candidateMultiples) {
   static_assert(std::is_integral<T>::value, "T must be an integer type");

   SmallVector<T> multiples;
   if (candidateMultiples.empty())
     multiples.push_back(1);
   else
     multiples.assign(candidateMultiples.begin(), candidateMultiples.end());

   int best = -1;
   for (T candidate : candidates) {
     for (T multiple : multiples) {
       const int product = static_cast<int>(candidate * multiple);
       // Skip zero (would divide by zero) and non-divisors.
       if (product == 0 || dim % product != 0)
         continue;
       if (product > best)
         best = product;
     }
   }
   return best;
 }

 /// Reduces `input` with `kind`, first within the vector via
 /// vector.reduction, then across lanes with gpu.shuffle in XOR mode.
 /// The shuffle offset starts at 1 and doubles while it is smaller than
 /// `size`; `size` is also passed as the shuffle width. After each shuffle the
 /// running value is combined with the shuffled one.
 Value xegpu::subgroupReduction(Location loc, OpBuilder &builder, Value input,
                                vector::CombiningKind kind, uint32_t size) {
   // Per-lane reduction of the input vector.
   Value partial = vector::ReductionOp::create(builder, loc, kind, input);

   // Cross-lane reduction using butterfly (XOR) shuffles.
   for (uint64_t laneOffset = 1; laneOffset < size; laneOffset <<= 1) {
     auto shuffleOp =
         gpu::ShuffleOp::create(builder, loc, partial, laneOffset,
                                /*width=*/size,
                                /*mode=*/gpu::ShuffleMode::XOR);
     Value shuffled = shuffleOp.getShuffleResult();
     partial = makeArithReduction(builder, loc, kind, partial, shuffled);
   }
   return partial;
 }

 /// Lowers a reduction of `src` along `reductionDim` into a sequence of 1D
 /// vector.reduction ops, one per slice, and returns the assembled result.
 ///
 /// `src` must have rank >= 2 with all dimensions except the last two equal to
 /// 1 (asserted). The last two dimensions are treated as rows and columns. If
 /// `reductionDim` is the row dimension there is one slice per column,
 /// otherwise one slice per row. For slice i the code emits:
 ///   extract_strided_slice -> shape_cast to 1D -> extract acc element ->
 ///   vector.reduction(kind, slice, acc element) -> insert into the result.
 /// The result starts as a zero constant of `acc`'s type.
 ///
 /// Temporary layouts are propagated: slices carry `src`'s temporary layout,
 /// while the constant, shape-cast results and inserts carry `acc`'s. When
 /// `src` or `acc` is not an op result (e.g. a block argument) it has no
 /// temporary layout to read and a null layout is used, which makes the
 /// corresponding setTemporaryLayout calls no-ops.
 Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
                                      TypedValue<VectorType> acc,
                                      vector::CombiningKind kind,
                                      int64_t reductionDim, Location loc,
                                      PatternRewriter &rewriter) {
   VectorType sourceType = src.getType();
   int64_t sourceRank = sourceType.getRank();
   // Expecting at least a 2D source vector. Leading dimensions (all except the
   // last two) must be unit.
   assert(sourceRank >= 2 && "expected at least a 2D source vector");
   for (int64_t i = 0; i < sourceRank - 2; ++i)
     assert(sourceType.getShape()[i] == 1 &&
            "expected leading dimensions to be unit");
   int64_t rowIdx = sourceRank - 2;
   int64_t columnIdx = sourceRank - 1;
   int64_t sourceH = sourceType.getShape()[rowIdx];
   int64_t sourceW = sourceType.getShape()[columnIdx];
   // Reducing along rows yields one slice per column, and vice versa.
   int nSlices = (reductionDim == rowIdx) ? sourceW : sourceH;
   // Create a constant vector to hold the result of the reduction.
   TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
   Value reductionResult = arith::ConstantOp::create(
       rewriter, loc, acc.getType(),
       DenseElementsAttr::get(acc.getType(), zeroAttr));
   // TODO: Remove these get/setTemporaryLayout calls after we deprecate the old
   // XeGPUSubgroupDistribute pass.
   // Temporary layouts live on the defining op, so they can only be queried
   // for op results. dyn_cast<OpResult> yields a null handle for block
   // arguments; querying that handle would dereference a null owner.
   auto getLayoutIfOpResult = [](Value v) -> xegpu::DistributeLayoutAttr {
     if (auto result = dyn_cast<OpResult>(v))
       return xegpu::getTemporaryLayout(result);
     return nullptr;
   };
   xegpu::DistributeLayoutAttr srcLayout = getLayoutIfOpResult(src);
   xegpu::DistributeLayoutAttr accLayout = getLayoutIfOpResult(acc);
   // Reduction result should have the same layout as the accumulator.
   xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
   // For each slice of the source, extract the slice vector, do a reduction
   // and, insert the reduced value back to the result vector.
   int64_t accRank = acc.getType().getRank();
   for (int i = 0; i < nSlices; ++i) {
     // Build nD offsets, sizes, and strides. Leading unit dims get
     // offset=0, size=1. The last two dims are set based on reductionDim.
     SmallVector<int64_t> sliceOffsets(sourceRank, 0);
     SmallVector<int64_t> sliceSizes(sourceRank, 1);
     SmallVector<int64_t> strides(sourceRank, 1);
     if (reductionDim == columnIdx) {
       // Slice i is row i, spanning all columns.
       sliceOffsets[rowIdx] = i;
       sliceSizes[columnIdx] = sourceW;
     } else {
       // Slice i is column i, spanning all rows.
       sliceOffsets[columnIdx] = i;
       sliceSizes[rowIdx] = sourceH;
     }

     vector::ExtractStridedSliceOp extractOp =
         vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
                                               sliceSizes, strides);
     // Extract strided slice has the same layout as src.
     xegpu::setTemporaryLayout(extractOp->getOpResult(0), srcLayout);

     int64_t nSliceElements = extractOp.getResult().getType().getNumElements();

     // Flatten the slice to 1D so it can feed vector.reduction.
     vector::ShapeCastOp slice = vector::ShapeCastOp::create(
         rewriter, loc,
         VectorType::get({nSliceElements}, sourceType.getElementType()),
         extractOp.getResult());

     // Shape cast output has the same layout as the accumulator. Shape cast
     // source has the same layout as the original reduction source.
     xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
     xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
     // Extract and reduction results in scalars, so no result layout is needed.
     // Build multi-dim index into acc (sourceRank-1 dims, i.e. source shape with
     // the reduction dim removed). Leading unit dims get index 0.
     SmallVector<int64_t> accIdx(accRank, 0);
     accIdx[accRank - 1] = i;
     Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, accIdx);
     Value reduction = vector::ReductionOp::create(
         rewriter, loc, kind, slice.getResult(), accExtract);
     reductionResult = vector::InsertOp::create(rewriter, loc, reduction,
                                                reductionResult, accIdx);
     // Insert op should have the same layout as the accumulator.
     xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
   }
   return reductionResult;
 }

 /// Lowers a reduction of `src` along `reductionDim` into per-slice subgroup
 /// reductions and returns the assembled result.
 ///
 /// `src` must have rank >= 2 with all dimensions except the last two equal to
 /// 1 (asserted). The last two dimensions are treated as rows and columns. For
 /// slice i the code emits:
 ///   extract_strided_slice -> shape_cast to 1D -> extract acc element ->
 ///   xegpu::subgroupReduction(slice, kind, reductionSize) ->
 ///   combine with the acc element -> insert into the result.
 /// The result starts as a zero constant of `acc`'s type.
 Value xegpu::lowerCrossLaneReductionToShuffles(
     TypedValue<VectorType> src, TypedValue<VectorType> acc,
     vector::CombiningKind kind, int64_t reductionDim, int64_t reductionSize,
     Location loc, PatternRewriter &rewriter) {
   VectorType srcTy = src.getType();
   const int64_t srcRank = srcTy.getRank();
   // The source must be at least 2D, and every dimension in front of the last
   // two must be of size one.
   assert(srcRank >= 2 && "expected at least a 2D source vector");
   for (int64_t dim = 0; dim < srcRank - 2; ++dim)
     assert(srcTy.getShape()[dim] == 1 &&
            "expected leading dimensions to be unit");

   const int64_t rowDim = srcRank - 2;
   const int64_t colDim = srcRank - 1;
   const int64_t numRows = srcTy.getShape()[rowDim];
   const int64_t numCols = srcTy.getShape()[colDim];
   Type elemTy = srcTy.getElementType();

   // Zero-initialized vector that accumulates the per-slice results.
   TypedAttr zeroAttr = rewriter.getZeroAttr(elemTy);
   Value result = arith::ConstantOp::create(
       rewriter, loc, acc.getType(),
       DenseElementsAttr::get(acc.getType(), zeroAttr));

   // Number of independent reductions. Reducing along the row dimension
   // collapses rows, so each slice is a column and there are `numCols` of
   // them; otherwise there is one slice per row.
   int numSlices = (reductionDim == rowDim) ? numCols : numRows;

   const int64_t accRank = acc.getType().getRank();
   for (int sliceIdx = 0; sliceIdx < numSlices; ++sliceIdx) {
     // nD slice description: leading unit dims keep offset 0 / size 1, the
     // last two dims select either one full row or one full column.
     SmallVector<int64_t> offsets(srcRank, 0);
     SmallVector<int64_t> sizes(srcRank, 1);
     SmallVector<int64_t> unitStrides(srcRank, 1);
     if (reductionDim == colDim) {
       offsets[rowDim] = sliceIdx;
       sizes[colDim] = numCols;
     } else {
       offsets[colDim] = sliceIdx;
       sizes[rowDim] = numRows;
     }

     vector::ExtractStridedSliceOp sliceOp =
         vector::ExtractStridedSliceOp::create(rewriter, loc, src, offsets,
                                               sizes, unitStrides);

     // Flatten the slice to a 1D vector.
     const int64_t sliceElems = sliceOp.getResult().getType().getNumElements();
     vector::ShapeCastOp flatSlice = vector::ShapeCastOp::create(
         rewriter, loc, VectorType::get({sliceElems}, elemTy),
         sliceOp.getResult());

     // Position of this slice's value in acc / result: all leading indices
     // are 0, the last index is the slice number.
     SmallVector<int64_t> accPos(accRank, 0);
     accPos[accRank - 1] = sliceIdx;
     Value accElem = vector::ExtractOp::create(rewriter, loc, acc, accPos);

     // Reduce across the subgroup, then fold in the accumulator element.
     Value reduced =
         xegpu::subgroupReduction(loc, rewriter, flatSlice, kind, reductionSize);
     reduced =
         vector::makeArithReduction(rewriter, loc, kind, reduced, accElem);
     result = vector::InsertOp::create(rewriter, loc, reduced, result, accPos);
   }
   return result;
 }

 /// Creates an `arith.constant` holding the neutral (identity) element of the
 /// reduction `kind` for `type`.
 ///
 /// `type` may be a scalar type or a VectorType; for a vector the neutral
 /// element is splatted over all elements. For the min/max and AND kinds a null
 /// Value is returned when the element type is not of the integer/float class
 /// that the kind operates on.
 Value xegpu::createReductionNeutralValue(OpBuilder &builder, Location loc,
                                          Type type,
                                          vector::CombiningKind kind) {
   auto vecTy = dyn_cast<VectorType>(type);
   Type elemTy = vecTy ? vecTy.getElementType() : type;

   // Helper to create either a splat vector or scalar constant from an attr.
   auto makeConst = [&](Attribute scalarAttr) -> Value {
     if (vecTy)
       return arith::ConstantOp::create(
           builder, loc, vecTy, DenseElementsAttr::get(vecTy, scalarAttr));
     return arith::ConstantOp::create(builder, loc, cast<TypedAttr>(scalarAttr));
   };

   switch (kind) {
   // x + 0 == x, x ^ 0 == x, x | 0 == x, umax(x, 0) == x.
   case vector::CombiningKind::ADD:
   case vector::CombiningKind::XOR:
   case vector::CombiningKind::OR:
   case vector::CombiningKind::MAXUI:
     return makeConst(builder.getZeroAttr(elemTy));

   // x * 1 == x.
   case vector::CombiningKind::MUL:
     return makeConst(builder.getOneAttr(elemTy));

   case vector::CombiningKind::AND:
     // The identity of a bitwise AND is the all-ones bit pattern, not the
     // value 1: `x & 1` would discard every bit but the lowest one for any
     // element type wider than i1.
     if (auto intTy = dyn_cast<IntegerType>(elemTy))
       return makeConst(builder.getIntegerAttr(
           elemTy, APInt::getAllOnes(intTy.getWidth())));
     if (elemTy.isIndex())
       return makeConst(builder.getIndexAttr(-1));
     return nullptr;

   // smin(x, INT_MAX) == x.
   case vector::CombiningKind::MINSI:
     if (auto intTy = dyn_cast<IntegerType>(elemTy))
       return makeConst(builder.getIntegerAttr(
           elemTy, APInt::getSignedMaxValue(intTy.getWidth())));
     return nullptr;

   // umin(x, UINT_MAX) == x.
   case vector::CombiningKind::MINUI:
     if (auto intTy = dyn_cast<IntegerType>(elemTy))
       return makeConst(
           builder.getIntegerAttr(elemTy, APInt::getMaxValue(intTy.getWidth())));
     return nullptr;

   // smax(x, INT_MIN) == x.
   case vector::CombiningKind::MAXSI:
     if (auto intTy = dyn_cast<IntegerType>(elemTy))
       return makeConst(builder.getIntegerAttr(
           elemTy, APInt::getSignedMinValue(intTy.getWidth())));
     return nullptr;

   // Floating-point minimum: start from +infinity.
   case vector::CombiningKind::MINNUMF:
   case vector::CombiningKind::MINIMUMF:
     if (auto floatTy = dyn_cast<FloatType>(elemTy))
       return makeConst(builder.getFloatAttr(
           elemTy, APFloat::getInf(floatTy.getFloatSemantics())));
     return nullptr;

   // Floating-point maximum: start from -infinity.
   case vector::CombiningKind::MAXNUMF:
   case vector::CombiningKind::MAXIMUMF:
     if (auto floatTy = dyn_cast<FloatType>(elemTy))
       return makeConst(builder.getFloatAttr(
           elemTy, APFloat::getInf(floatTy.getFloatSemantics(), true)));
     return nullptr;
   }
   return nullptr;
 }

 /// Explicit instantiations of xegpu::getLargestDivisor for `int` and
 /// `unsigned` operands. Both instantiations return `int`. The template
 /// definition lies outside this part of the file; presumably these
 /// instantiations are what make the two specializations linkable from other
 /// translation units -- confirm against the declaration in XeGPUUtils.h.
 template int xegpu::getLargestDivisor<int>(int dim, ArrayRef<int> candidates,
                                            ArrayRef<int> candidateMultiples);
 template int
 xegpu::getLargestDivisor<unsigned>(unsigned dim, ArrayRef<unsigned> candidates,
                                    ArrayRef<unsigned> candidateMultiples);

 /// Returns true when `layout` is non-null and its effective lane data is
 /// two-dimensional with a leading entry different from 1. Any other layout
 /// (null, or of a different rank) yields false.
 bool xegpu::requirePacked(const xegpu::DistributeLayoutAttr layout) {
   if (layout) {
     auto effectiveLaneData = layout.getEffectiveLaneDataAsInt();
     return effectiveLaneData.size() == 2 && effectiveLaneData[0] != 1;
   }
   return false;
 }

 /// Returns true when `layout` is non-null, its effective lane layout is
 /// two-dimensional, and that lane layout is [subgroupSize, 1] for the given
 /// `uArch`. Always returns false for targets other than "pvc" and "bmg"
 /// (compared case-insensitively).
 bool xegpu::requireTranspose(const xegpu::DistributeLayoutAttr layout,
                              const xegpu::uArch::uArch *uArch) {
   // Return false for unsupported targets, i.e. anything that is neither PVC
   // nor BMG. Both comparisons must be negated: a name can never equal "pvc"
   // and "bmg" at once, so the un-negated form would never reject a target.
   // TODO: Add more support or move to target info.
   if (!uArch->getName().equals_insensitive("pvc") &&
       !uArch->getName().equals_insensitive("bmg"))
     return false;
   if (!layout)
     return false;
   auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
   if (laneLayout.size() != 2)
     return false;
   // All lanes are laid out along the first dimension.
   return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1;
 }

 // Check if dst shape is an expansion of src shape by inserting unit dimensions.
 // Returns true if all dimensions in src match corresponding dimensions in dst
 // (after skipping unit dimensions), and populates expandedUnitDims with the
 // indices of the unit dimensions in dst that were added (not present in src).
 // expandedUnitDims is cleared on entry, so any previous contents are
 // discarded. When false is returned it may hold the unit dims collected
 // before the mismatch and must not be relied upon.
 // Example: src=[2,3], dst=[1,2,3,1] -> true, expandedUnitDims=[0,3]
 bool xegpu::matchUnitDimExpansion(ArrayRef<int64_t> src, ArrayRef<int64_t> dst,
                                   SmallVector<int64_t> &expandedUnitDims) {
   // Start from an empty result so that a reused output vector never mixes
   // stale indices with the ones computed by this call.
   expandedUnitDims.clear();

   // Walk dst left to right, consuming src dims greedily. Every dst dim that
   // does not match the next pending src dim has to be an inserted unit dim.
   size_t srcIdx = 0;
   for (size_t dstIdx = 0; dstIdx < dst.size(); ++dstIdx) {
     if (srcIdx < src.size() && src[srcIdx] == dst[dstIdx]) {
       ++srcIdx;
       continue;
     }
     if (dst[dstIdx] != 1)
       return false;
     expandedUnitDims.push_back(dstIdx);
   }
   // Every src dim must have found its counterpart in dst.
   return srcIdx == src.size();
 }

 // Tests whether dst can be obtained from src by splitting every src dimension
 // into a run of one or more consecutive dst dimensions whose sizes multiply
 // to that src dimension. On success splitDimGroups[i] lists the dst indices
 // that make up src dimension i. splitDimGroups is always cleared first; when
 // false is returned it may contain the groups completed before the mismatch.
 // Example: src=[6,4], dst=[2,3,2,2] -> true, splitDimGroups=[[0,1],[2,3]]
 bool xegpu::matchSplitDimExpansion(
     ArrayRef<int64_t> src, ArrayRef<int64_t> dst,
     SmallVector<SmallVector<int64_t>> &splitDimGroups) {
   splitDimGroups.clear();

   const size_t numSrcDims = src.size();
   size_t nextSrc = 0;                // src dim currently being assembled
   int64_t product = 1;               // product of the dst dims in the open group
   SmallVector<int64_t> pendingGroup; // dst indices of the open group

   for (size_t d = 0, numDstDims = dst.size(); d != numDstDims; ++d) {
     // dst still has dims left although every src dim is already covered.
     if (nextSrc >= numSrcDims)
       return false;

     product *= dst[d];
     pendingGroup.push_back(d);

     const int64_t target = src[nextSrc];
     // Overshooting the src dim means no grouping can work.
     if (product > target)
       return false;
     // Not there yet: keep extending the current group.
     if (product < target)
       continue;

     // Exact match: close the group and move on to the next src dim.
     splitDimGroups.push_back(pendingGroup);
     pendingGroup.clear();
     product = 1;
     ++nextSrc;
   }
   return nextSrc == numSrcDims;
 }