| //===- XeGPUDialect.cpp - MLIR XeGPU dialect implementation -----*- C++ -*-===// | 
 | // | 
 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
 | // See https://llvm.org/LICENSE.txt for license information. | 
 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
 | // | 
 | //===----------------------------------------------------------------------===// | 
 |  | 
 | #include "mlir/Dialect/Utils/IndexingUtils.h" | 
 | #include "mlir/Dialect/XeGPU/IR/XeGPU.h" | 
 | #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" | 
 | #include "mlir/IR/Builders.h" | 
 | #include "mlir/IR/DialectImplementation.h" | 
 | #include "llvm/ADT/TypeSwitch.h" | 
 | #include <numeric> | 
 |  | 
 | using std::optional; | 
 |  | 
 | namespace mlir { | 
 | namespace xegpu { | 
 |  | 
// Registers all XeGPU types, operations, and attributes with the dialect.
// The member lists are generated by TableGen into the *.cpp.inc files
// included below; each GET_*_LIST macro selects the corresponding
// comma-separated class list from its generated file.
void XeGPUDialect::initialize() {
  addTypes<
#define GET_TYPEDEF_LIST
#include <mlir/Dialect/XeGPU/IR/XeGPUTypes.cpp.inc>
      >();
  addOperations<
#define GET_OP_LIST
#include <mlir/Dialect/XeGPU/IR/XeGPU.cpp.inc>
      >();
  addAttributes<
#define GET_ATTRDEF_LIST
#include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.cpp.inc>
      >();
}
 |  | 
 | // Checks if the given shape can be evenly distributed based on the layout | 
 | // and data factors provided by the LayoutAttr. | 
 | bool XeGPUDialect::isEvenlyDistributable(llvm::ArrayRef<int64_t> shape, | 
 |                                          xegpu::LayoutAttr attr) { | 
 |   assert(attr && "Layout attribute is missing."); | 
 |  | 
 |   // Checks whether the given shape can be evenly distributed using the | 
 |   // specified layout and data attributes. If successful, it returns the work | 
 |   // size for each compute unit; otherwise, it returns `std::nullopt`. The work | 
 |   // size per compute unit is calculated as follows: | 
 |   //   - If `data` is null: newShape[i] = shape[i] / layout[i] | 
 |   //   - If `data` is not null: newShape[i] = data[i] | 
 |   // When round-robin distribution (`rr`) is enabled, `shape[i]` can be | 
 |   // smaller than `layout[i] * data[i]`, allowing multiple compute units to | 
 |   // share the data. | 
 |   auto tryDistribute = [&](llvm::ArrayRef<int64_t> shape, | 
 |                            DenseI32ArrayAttr layout, DenseI32ArrayAttr data, | 
 |                            bool rr = true) -> optional<SmallVector<int64_t>> { | 
 |     llvm::SmallVector<int64_t> newShape(shape); | 
 |     if (layout) { | 
 |       auto vec = llvm::to_vector_of<int64_t>(layout.asArrayRef()); | 
 |       if (vec.size() != shape.size()) | 
 |         return std::nullopt; | 
 |       auto ratio = computeShapeRatio(shape, vec); | 
 |       if (!ratio.has_value()) | 
 |         return std::nullopt; | 
 |       newShape = ratio.value(); | 
 |     } | 
 |  | 
 |     if (data) { | 
 |       auto vec = llvm::to_vector_of<int64_t>(data.asArrayRef()); | 
 |       if (vec.size() != shape.size()) | 
 |         return std::nullopt; | 
 |       auto ratio = computeShapeRatio(newShape, vec); | 
 |       if (!ratio.has_value() && rr) | 
 |         ratio = computeShapeRatio(vec, newShape); | 
 |       if (!ratio.has_value()) | 
 |         return std::nullopt; | 
 |  | 
 |       // if data is not null, we always return it for next phase. | 
 |       newShape = vec; | 
 |     } | 
 |     return newShape; | 
 |   }; | 
 |  | 
 |   // check the sgLayout and sgData | 
 |   auto maybeSgShape = | 
 |       tryDistribute(shape, attr.getSgLayout(), attr.getSgData()); | 
 |   if (!maybeSgShape) | 
 |     return false; | 
 |   auto sgShape = maybeSgShape.value(); | 
 |  | 
 |   // check InstData, it neither have layout nor need round-robin | 
 |   auto maybeInstShape = | 
 |       tryDistribute(sgShape, nullptr, attr.getInstData(), false); | 
 |   if (!maybeInstShape) | 
 |     return false; | 
 |   auto instShape = maybeInstShape.value(); | 
 |  | 
 |   // check LaneLayout and LaneData | 
 |   auto maybeLaneShape = | 
 |       tryDistribute(instShape, attr.getLaneLayout(), attr.getLaneData(), false); | 
 |   return maybeLaneShape.has_value(); | 
 | } | 
 |  | 
 | //===----------------------------------------------------------------------===// | 
 | // XeGPU_BlockTensorDescAttr | 
 | //===----------------------------------------------------------------------===// | 
 | BlockTensorDescAttr BlockTensorDescAttr::get(mlir::MLIRContext *context, | 
 |                                              xegpu::MemorySpace memory_space, | 
 |                                              int array_length, | 
 |                                              bool boundary_check) { | 
 |   auto scopeAttr = MemorySpaceAttr::get(context, memory_space); | 
 |   auto lengthAttr = | 
 |       IntegerAttr::get(IntegerType::get(context, 64), array_length); | 
 |   auto boundaryAttr = BoolAttr::get(context, boundary_check); | 
 |   return Base::get(context, scopeAttr, lengthAttr, boundaryAttr); | 
 | } | 
 |  | 
 | //===----------------------------------------------------------------------===// | 
 | // XeGPU_ScatterTensorDescAttr | 
 | //===----------------------------------------------------------------------===// | 
 | ScatterTensorDescAttr | 
 | ScatterTensorDescAttr::get(mlir::MLIRContext *context, | 
 |                            xegpu::MemorySpace memory_space, int chunk_size) { | 
 |   auto scopeAttr = MemorySpaceAttr::get(context, memory_space); | 
 |   auto chunkSizeAttr = | 
 |       IntegerAttr::get(IntegerType::get(context, 64), chunk_size); | 
 |   return Base::get(context, scopeAttr, chunkSizeAttr); | 
 | } | 
 |  | 
 | LogicalResult ScatterTensorDescAttr::verify( | 
 |     llvm::function_ref<mlir::InFlightDiagnostic()> emitError, | 
 |     MemorySpaceAttr memory_space, IntegerAttr chunk_size) { | 
 |   int64_t chunkSize = chunk_size.getInt(); | 
 |   SmallVector<int64_t> supportedChunkSizes = {1,  2,  3,  4,   8, | 
 |                                               16, 32, 64, 128, 256}; | 
 |   if (!llvm::is_contained(supportedChunkSizes, chunkSize)) | 
 |     return emitError() << "invalid chunk size"; | 
 |  | 
 |   return success(); | 
 | } | 
 |  | 
 | //===----------------------------------------------------------------------===// | 
 | // XeGPU_LayoutAttr | 
 | //===----------------------------------------------------------------------===// | 
 | LogicalResult | 
 | LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError, | 
 |                    DenseI32ArrayAttr sg_layout, DenseI32ArrayAttr sg_data, | 
 |                    DenseI32ArrayAttr inst_data, DenseI32ArrayAttr lane_layout, | 
 |                    DenseI32ArrayAttr lane_data, DenseI32ArrayAttr order) { | 
 |  | 
 |   // A valid layout must include at least one of sg_layout and lane_layout. | 
 |   // sg_layout is essential for Workgroup layout, while lane_layout is | 
 |   // required for Subgroup layout. | 
 |   if (!sg_layout && !inst_data && !lane_layout) { | 
 |     return emitError() | 
 |            << "expected at least one of sg_layout, inst_data or lane_layout"; | 
 |   } | 
 |  | 
 |   // generate code to check sg_laout, inst_data and lane_layout having the same | 
 |   // rank if they are not null. | 
 |  | 
 |   if (sg_layout && inst_data && sg_layout.size() != inst_data.size()) { | 
 |     return emitError() | 
 |            << "expected sg_layout and inst_data to have the same rank"; | 
 |   } | 
 |  | 
 |   if (sg_layout && lane_layout && sg_layout.size() != lane_layout.size()) { | 
 |     return emitError() | 
 |            << "expected sg_layout and lane_layout to have the same rank"; | 
 |   } | 
 |  | 
 |   if (inst_data && lane_layout && inst_data.size() != lane_layout.size()) { | 
 |     return emitError() | 
 |            << "expected inst_data and lane_layout to have the same rank"; | 
 |   } | 
 |  | 
 |   // sg_data is optional for Workgroup layout, but its presence requires | 
 |   // sg_layout. | 
 |   if (sg_data) { | 
 |     if (!sg_layout) | 
 |       return emitError() << "expected sg_layout being used with sg_data"; | 
 |     if (sg_data.size() != sg_layout.size()) | 
 |       return emitError() | 
 |              << "expected sg_data and sg_layout to have the same rank"; | 
 |   } | 
 |  | 
 |   // lane_data is optional for Subgroup layout, but its presence requires | 
 |   // lane_layout. | 
 |   if (lane_data) { | 
 |     if (!lane_layout) | 
 |       return emitError() << "expected lane_layout being used with lane_data"; | 
 |     if (lane_data.size() != lane_layout.size()) | 
 |       return emitError() | 
 |              << "expected lane_data and lane_layout to have the same rank"; | 
 |   } | 
 |  | 
 |   if (order) { | 
 |     if (!sg_layout && !lane_layout) | 
 |       return emitError() | 
 |              << "expected sg_layout/lane_layout being used with order"; | 
 |  | 
 |     if (sg_layout && order.size() != sg_layout.size()) | 
 |       return emitError() | 
 |              << "expected order and sg_layout to have the same rank"; | 
 |  | 
 |     if (lane_layout && order.size() != lane_layout.size()) | 
 |       return emitError() | 
 |              << "expected order and lane_layout to have the same rank"; | 
 |   } | 
 |  | 
 |   return success(); | 
 | } | 
 |  | 
 | //===----------------------------------------------------------------------===// | 
 | // XeGPU_TensorDescType | 
 | //===----------------------------------------------------------------------===// | 
 |  | 
 | mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) { | 
 |   llvm::SmallVector<int64_t> shape; | 
 |   mlir::Type elementType; | 
 |   mlir::FailureOr<mlir::Attribute> encoding; | 
 |   mlir::FailureOr<mlir::Attribute> layout; | 
 |  | 
 |   // Parse literal '<' | 
 |   if (parser.parseLess()) | 
 |     return {}; | 
 |  | 
 |   auto shapeLoc = parser.getCurrentLocation(); | 
 |   if (mlir::failed(parser.parseDimensionList(shape))) { | 
 |     parser.emitError(shapeLoc, "failed to parse parameter 'shape'"); | 
 |     return {}; | 
 |   } | 
 |  | 
 |   auto elemTypeLoc = parser.getCurrentLocation(); | 
 |   if (mlir::failed(parser.parseType(elementType))) { | 
 |     parser.emitError(elemTypeLoc, "failed to parse parameter 'elementType'"); | 
 |     return {}; | 
 |   } | 
 |  | 
 |   // parse optional attributes | 
 |   while (mlir::succeeded(parser.parseOptionalComma())) { | 
 |     mlir::Attribute attr; | 
 |     ParseResult res = parser.parseAttribute(attr); | 
 |     if (mlir::succeeded(res)) { | 
 |       if (mlir::isa<LayoutAttr>(attr)) { | 
 |         layout = attr; | 
 |         continue; | 
 |       } | 
 |       if (mlir::isa<BlockTensorDescAttr, ScatterTensorDescAttr>(attr)) { | 
 |         encoding = attr; | 
 |         continue; | 
 |       } | 
 |     } | 
 |     return {}; | 
 |   } | 
 |  | 
 |   // Parse literal '>' | 
 |   if (parser.parseGreater()) | 
 |     return {}; | 
 |  | 
 |   return TensorDescType::getChecked( | 
 |       [&]() { return parser.emitError(parser.getNameLoc()); }, | 
 |       parser.getContext(), shape, elementType, | 
 |       encoding.value_or(mlir::Attribute()), layout.value_or(mlir::Attribute())); | 
 | } | 
 |  | 
 | void TensorDescType::print(::mlir::AsmPrinter &printer) const { | 
 |   printer << "<"; | 
 |  | 
 |   auto shape = getShape(); | 
 |   for (int64_t dim : shape) { | 
 |     if (mlir::ShapedType::isDynamic(dim)) | 
 |       printer << '?'; | 
 |     else | 
 |       printer << dim; | 
 |     printer << 'x'; | 
 |   } | 
 |  | 
 |   printer << getElementType(); | 
 |  | 
 |   if (auto encoding = getEncoding()) | 
 |     printer << ", " << encoding; | 
 |  | 
 |   if (auto layout = getLayout()) | 
 |     printer << ", " << layout; | 
 |  | 
 |   printer << ">"; | 
 | } | 
 |  | 
 | TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape, | 
 |                                    mlir::Type elementType, int array_length, | 
 |                                    bool boundary_check, | 
 |                                    MemorySpace memory_space, | 
 |                                    mlir::Attribute layout) { | 
 |   auto context = elementType.getContext(); | 
 |   auto attr = BlockTensorDescAttr::get(context, memory_space, array_length, | 
 |                                        boundary_check); | 
 |   return Base::get(context, shape, elementType, attr, layout); | 
 | } | 
 |  | 
 | TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape, | 
 |                                    mlir::Type elementType, int chunk_size, | 
 |                                    MemorySpace memory_space, | 
 |                                    mlir::Attribute layout) { | 
 |   auto context = elementType.getContext(); | 
 |   auto attr = ScatterTensorDescAttr::get(context, memory_space, chunk_size); | 
 |   return Base::get(context, shape, elementType, attr, layout); | 
 | } | 
 |  | 
 | LogicalResult TensorDescType::verify( | 
 |     llvm::function_ref<::mlir::InFlightDiagnostic()> emitError, | 
 |     llvm::ArrayRef<int64_t> shape, mlir::Type elementType, | 
 |     mlir::Attribute encoding, mlir::Attribute layout) { | 
 |   size_t rank = shape.size(); | 
 |   if (rank != 1 && rank != 2) | 
 |     return emitError() << "expected 1D or 2D tensor"; | 
 |  | 
 |   auto blockAttr = mlir::dyn_cast_if_present<BlockTensorDescAttr>(encoding); | 
 |   if (blockAttr) { | 
 |     MemorySpaceAttr memorySpaceAttr = blockAttr.getMemorySpace(); | 
 |     if (rank == 2 && memorySpaceAttr && | 
 |         memorySpaceAttr.getValue() == MemorySpace::SLM) | 
 |       return emitError() << "SLM is not supported for 2D block tensor"; | 
 |   } | 
 |  | 
 |   // for gather and scatter ops, Low-precision types are packed in 32-bit units. | 
 |   unsigned bitWidth = elementType.getIntOrFloatBitWidth(); | 
 |   int chunkAlignmentFactor = | 
 |       bitWidth < targetinfo::packedSizeInBitsForGatherScatter | 
 |           ? targetinfo::packedSizeInBitsForGatherScatter / bitWidth | 
 |           : 1; | 
 |   auto scatterAttr = mlir::dyn_cast_if_present<ScatterTensorDescAttr>(encoding); | 
 |   if (scatterAttr) { | 
 |     // Expected tensor ranks for scattered data: | 
 |     //   - 1D tensor for fully non-contiguous elements (chunk size == 1) | 
 |     //   - 2D tensor for scattered blocks (chunk size > 1) | 
 |     unsigned chunkSize = scatterAttr.getChunkSize().getInt(); | 
 |     if (rank == 1 && chunkSize != 1) | 
 |       return emitError() << "expected non-contiguous elements for 1D tensor"; | 
 |     if (rank == 2 && chunkSize < 2) | 
 |       return emitError() << "expected chunk blocks for 2D tensor"; | 
 |     // If chunk size > 1, the second dimension of the tensor shape must be | 
 |     // equal to chunk size and it must be a multiple of the packing factor. | 
 |     if (chunkSize > 1) { | 
 |       if (shape.back() != chunkSize) | 
 |         return emitError() << "expected tensor shape[1] to match chunk size"; | 
 |       if (shape.back() % chunkAlignmentFactor != 0) | 
 |         return emitError() << "expected tensor shape[1] to be a multiple of " | 
 |                               "chunk alignment factor " | 
 |                            << chunkAlignmentFactor; | 
 |     } | 
 |   } | 
 |  | 
 |   auto layoutAttr = llvm::dyn_cast_if_present<LayoutAttr>(layout); | 
 |   if (layoutAttr) { | 
 |     if (rank != (size_t)layoutAttr.getRank()) | 
 |       return emitError() << "expected layout rank to match tensor rank"; | 
 |  | 
 |     auto laneData = layoutAttr.getLaneData(); | 
 |     if (scatterAttr && laneData) { | 
 |       // Validate subgroup mapping rules for scattered tensors. | 
 |       // A work-item's slice of the tensor with shape [sg_size] or | 
 |       // [sg_size, chunk_size] will be [1] or [1, 32/element_ty_bit_width] | 
 |       // respectively, the mapping should reflect that. This is because each | 
 |       // work item access data in 32 bit granularity. | 
 |  | 
 |       if (rank > 1 && laneData[0] != 1) | 
 |         return emitError() | 
 |                << "cannot map over non-contiguous scattered row elements"; | 
 |       if (laneData[rank - 1] != chunkAlignmentFactor) | 
 |         return emitError() << "work item data mapping must match the number of " | 
 |                               "contiguous elements"; | 
 |     } | 
 |  | 
 |     if (!XeGPUDialect::isEvenlyDistributable(shape, layoutAttr)) { | 
 |       std::string shapeStr; | 
 |       llvm::raw_string_ostream stream(shapeStr); | 
 |       llvm::interleaveComma(shape, stream); | 
 |       return emitError() << "cannot distribute [" << shapeStr << "] using " | 
 |                          << layoutAttr; | 
 |     } | 
 |   } | 
 |   return success(); | 
 | } | 
 |  | 
 | } // namespace xegpu | 
 | } // namespace mlir | 
 |  | 
 | #include <mlir/Dialect/XeGPU/IR/XeGPUDialect.cpp.inc> | 
 | #define GET_ATTRDEF_CLASSES | 
 | #include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.cpp.inc> | 
 | #define GET_TYPEDEF_CLASSES | 
 | #include <mlir/Dialect/XeGPU/IR/XeGPUTypes.cpp.inc> |