//===- XeGPUDialect.cpp - MLIR XeGPU dialect implementation -----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/DialectImplementation.h"
#include "llvm/ADT/TypeSwitch.h"
#include <numeric>
using std::optional;
namespace mlir {
namespace xegpu {
void XeGPUDialect::initialize() {
addTypes<
#define GET_TYPEDEF_LIST
#include <mlir/Dialect/XeGPU/IR/XeGPUTypes.cpp.inc>
>();
addOperations<
#define GET_OP_LIST
#include <mlir/Dialect/XeGPU/IR/XeGPU.cpp.inc>
>();
addAttributes<
#define GET_ATTRDEF_LIST
#include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.cpp.inc>
>();
}
// Checks if the given shape can be evenly distributed based on the layout
// and data factors provided by the LayoutAttr.
bool XeGPUDialect::isEvenlyDistributable(llvm::ArrayRef<int64_t> shape,
xegpu::LayoutAttr attr) {
assert(attr && "Layout attribute is missing.");
// The lambda below checks whether the given shape can be evenly distributed
// using the specified layout and data attributes. If successful, it returns
// the work size for each compute unit; otherwise, it returns `std::nullopt`.
// The work size per compute unit is calculated as follows:
// - If `data` is null: newShape[i] = shape[i] / layout[i]
// - If `data` is not null: newShape[i] = data[i]
// When round-robin distribution (`rr`) is enabled, `shape[i]` can be
// smaller than `layout[i] * data[i]`, allowing multiple compute units to
// share the data.
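// Illustrative example (values assumed for exposition, not tied to any
// target): with shape = [128, 64], layout = [4, 2] and data = [32, 32],
// each compute unit receives a [32, 32] tile; with data omitted, the tile
// is shape[i] / layout[i] = [32, 32] as well. With `rr` enabled, a smaller
// shape such as [32, 64] is still accepted because compute units share the
// data in a round-robin fashion.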
auto tryDistribute = [&](llvm::ArrayRef<int64_t> shape,
DenseI32ArrayAttr layout, DenseI32ArrayAttr data,
bool rr = true) -> optional<SmallVector<int64_t>> {
llvm::SmallVector<int64_t> newShape(shape);
if (layout) {
auto vec = llvm::to_vector_of<int64_t>(layout.asArrayRef());
if (vec.size() != shape.size())
return std::nullopt;
auto ratio = computeShapeRatio(shape, vec);
if (!ratio.has_value())
return std::nullopt;
newShape = ratio.value();
}
if (data) {
auto vec = llvm::to_vector_of<int64_t>(data.asArrayRef());
if (vec.size() != shape.size())
return std::nullopt;
auto ratio = computeShapeRatio(newShape, vec);
if (!ratio.has_value() && rr)
ratio = computeShapeRatio(vec, newShape);
if (!ratio.has_value())
return std::nullopt;
// if data is not null, we always return it for the next phase.
newShape = vec;
}
return newShape;
};
// check the sgLayout and sgData
auto maybeSgShape =
tryDistribute(shape, attr.getSgLayout(), attr.getSgData());
if (!maybeSgShape)
return false;
auto sgShape = maybeSgShape.value();
// check InstData; it has no layout and does not need round-robin distribution
auto maybeInstShape =
tryDistribute(sgShape, nullptr, attr.getInstData(), false);
if (!maybeInstShape)
return false;
auto instShape = maybeInstShape.value();
// check LaneLayout and LaneData
auto maybeLaneShape =
tryDistribute(instShape, attr.getLaneLayout(), attr.getLaneData(), false);
return maybeLaneShape.has_value();
}
//===----------------------------------------------------------------------===//
// XeGPU_BlockTensorDescAttr
//===----------------------------------------------------------------------===//
BlockTensorDescAttr BlockTensorDescAttr::get(mlir::MLIRContext *context,
xegpu::MemorySpace memory_space,
int array_length,
bool boundary_check) {
auto scopeAttr = MemorySpaceAttr::get(context, memory_space);
auto lengthAttr =
IntegerAttr::get(IntegerType::get(context, 64), array_length);
auto boundaryAttr = BoolAttr::get(context, boundary_check);
return Base::get(context, scopeAttr, lengthAttr, boundaryAttr);
}
//===----------------------------------------------------------------------===//
// XeGPU_ScatterTensorDescAttr
//===----------------------------------------------------------------------===//
ScatterTensorDescAttr
ScatterTensorDescAttr::get(mlir::MLIRContext *context,
xegpu::MemorySpace memory_space, int chunk_size) {
auto scopeAttr = MemorySpaceAttr::get(context, memory_space);
auto chunkSizeAttr =
IntegerAttr::get(IntegerType::get(context, 64), chunk_size);
return Base::get(context, scopeAttr, chunkSizeAttr);
}
LogicalResult ScatterTensorDescAttr::verify(
llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
MemorySpaceAttr memory_space, IntegerAttr chunk_size) {
int64_t chunkSize = chunk_size.getInt();
SmallVector<int64_t> supportedChunkSizes = {1, 2, 3, 4, 8,
16, 32, 64, 128, 256};
if (!llvm::is_contained(supportedChunkSizes, chunkSize))
return emitError() << "invalid chunk size";
return success();
}
//===----------------------------------------------------------------------===//
// XeGPU_LayoutAttr
//===----------------------------------------------------------------------===//
LogicalResult
LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
DenseI32ArrayAttr sg_layout, DenseI32ArrayAttr sg_data,
DenseI32ArrayAttr inst_data, DenseI32ArrayAttr lane_layout,
DenseI32ArrayAttr lane_data, DenseI32ArrayAttr order) {
// A valid layout must specify at least one of sg_layout, inst_data, or
// lane_layout. sg_layout is essential for the workgroup-level layout, while
// lane_layout is required for the subgroup-level layout.
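// Illustrative examples (assuming the dialect's documented attribute syntax):
//   workgroup-level: #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16]>
//   subgroup-level:  #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>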
if (!sg_layout && !inst_data && !lane_layout) {
return emitError()
<< "expected at least one of sg_layout, inst_data or lane_layout";
}
// Check that sg_layout, inst_data and lane_layout have the same rank when
// they are present.
if (sg_layout && inst_data && sg_layout.size() != inst_data.size()) {
return emitError()
<< "expected sg_layout and inst_data to have the same rank";
}
if (sg_layout && lane_layout && sg_layout.size() != lane_layout.size()) {
return emitError()
<< "expected sg_layout and lane_layout to have the same rank";
}
if (inst_data && lane_layout && inst_data.size() != lane_layout.size()) {
return emitError()
<< "expected inst_data and lane_layout to have the same rank";
}
// sg_data is optional for Workgroup layout, but its presence requires
// sg_layout.
if (sg_data) {
if (!sg_layout)
return emitError() << "expected sg_layout being used with sg_data";
if (sg_data.size() != sg_layout.size())
return emitError()
<< "expected sg_data and sg_layout to have the same rank";
}
// lane_data is optional for Subgroup layout, but its presence requires
// lane_layout.
if (lane_data) {
if (!lane_layout)
return emitError() << "expected lane_layout being used with lane_data";
if (lane_data.size() != lane_layout.size())
return emitError()
<< "expected lane_data and lane_layout to have the same rank";
}
if (order) {
if (!sg_layout && !lane_layout)
return emitError()
<< "expected sg_layout/lane_layout being used with order";
if (sg_layout && order.size() != sg_layout.size())
return emitError()
<< "expected order and sg_layout to have the same rank";
if (lane_layout && order.size() != lane_layout.size())
return emitError()
<< "expected order and lane_layout to have the same rank";
}
return success();
}
//===----------------------------------------------------------------------===//
// XeGPU_TensorDescType
//===----------------------------------------------------------------------===//
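// The custom assembly form handled by parse/print below is
//   !xegpu.tensor_desc<dims x element-type [, encoding] [, layout]>
// for example (illustrative, assuming the dialect's documented attribute
// syntax):
//   !xegpu.tensor_desc<8x16xf16>
//   !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>>
//   !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>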
mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
llvm::SmallVector<int64_t> shape;
mlir::Type elementType;
mlir::FailureOr<mlir::Attribute> encoding;
mlir::FailureOr<mlir::Attribute> layout;
// Parse literal '<'
if (parser.parseLess())
return {};
auto shapeLoc = parser.getCurrentLocation();
if (mlir::failed(parser.parseDimensionList(shape))) {
parser.emitError(shapeLoc, "failed to parse parameter 'shape'");
return {};
}
auto elemTypeLoc = parser.getCurrentLocation();
if (mlir::failed(parser.parseType(elementType))) {
parser.emitError(elemTypeLoc, "failed to parse parameter 'elementType'");
return {};
}
// parse optional attributes
while (mlir::succeeded(parser.parseOptionalComma())) {
mlir::Attribute attr;
ParseResult res = parser.parseAttribute(attr);
if (mlir::succeeded(res)) {
if (mlir::isa<LayoutAttr>(attr)) {
layout = attr;
continue;
}
if (mlir::isa<BlockTensorDescAttr, ScatterTensorDescAttr>(attr)) {
encoding = attr;
continue;
}
}
return {};
}
// Parse literal '>'
if (parser.parseGreater())
return {};
return TensorDescType::getChecked(
[&]() { return parser.emitError(parser.getNameLoc()); },
parser.getContext(), shape, elementType,
encoding.value_or(mlir::Attribute()), layout.value_or(mlir::Attribute()));
}
void TensorDescType::print(::mlir::AsmPrinter &printer) const {
printer << "<";
auto shape = getShape();
for (int64_t dim : shape) {
if (mlir::ShapedType::isDynamic(dim))
printer << '?';
else
printer << dim;
printer << 'x';
}
printer << getElementType();
if (auto encoding = getEncoding())
printer << ", " << encoding;
if (auto layout = getLayout())
printer << ", " << layout;
printer << ">";
}
TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
mlir::Type elementType, int array_length,
bool boundary_check,
MemorySpace memory_space,
mlir::Attribute layout) {
auto context = elementType.getContext();
auto attr = BlockTensorDescAttr::get(context, memory_space, array_length,
boundary_check);
return Base::get(context, shape, elementType, attr, layout);
}
TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
mlir::Type elementType, int chunk_size,
MemorySpace memory_space,
mlir::Attribute layout) {
auto context = elementType.getContext();
auto attr = ScatterTensorDescAttr::get(context, memory_space, chunk_size);
return Base::get(context, shape, elementType, attr, layout);
}
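// A minimal usage sketch of the two helpers above (illustrative; `builder`
// is an assumed OpBuilder and the argument values are placeholders):
//   auto blockTy =
//       TensorDescType::get({8, 16}, builder.getF16Type(), /*array_length=*/1,
//                           /*boundary_check=*/true, MemorySpace::Global,
//                           /*layout=*/mlir::Attribute());
//   auto scatterTy =
//       TensorDescType::get({16, 2}, builder.getF16Type(), /*chunk_size=*/2,
//                           MemorySpace::Global, /*layout=*/mlir::Attribute());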
LogicalResult TensorDescType::verify(
llvm::function_ref<::mlir::InFlightDiagnostic()> emitError,
llvm::ArrayRef<int64_t> shape, mlir::Type elementType,
mlir::Attribute encoding, mlir::Attribute layout) {
size_t rank = shape.size();
if (rank != 1 && rank != 2)
return emitError() << "expected 1D or 2D tensor";
auto blockAttr = mlir::dyn_cast_if_present<BlockTensorDescAttr>(encoding);
if (blockAttr) {
MemorySpaceAttr memorySpaceAttr = blockAttr.getMemorySpace();
if (rank == 2 && memorySpaceAttr &&
memorySpaceAttr.getValue() == MemorySpace::SLM)
return emitError() << "SLM is not supported for 2D block tensor";
}
// For gather and scatter ops, low-precision types are packed in 32-bit units.
unsigned bitWidth = elementType.getIntOrFloatBitWidth();
int chunkAlignmentFactor =
bitWidth < targetinfo::packedSizeInBitsForGatherScatter
? targetinfo::packedSizeInBitsForGatherScatter / bitWidth
: 1;
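// For example, assuming packedSizeInBitsForGatherScatter is 32 bits: f16
// elements (16 bits) yield a chunkAlignmentFactor of 32 / 16 = 2, while f32
// elements yield a factor of 1.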
auto scatterAttr = mlir::dyn_cast_if_present<ScatterTensorDescAttr>(encoding);
if (scatterAttr) {
// Expected tensor ranks for scattered data:
// - 1D tensor for fully non-contiguous elements (chunk size == 1)
// - 2D tensor for scattered blocks (chunk size > 1)
unsigned chunkSize = scatterAttr.getChunkSize().getInt();
if (rank == 1 && chunkSize != 1)
return emitError() << "expected non-contiguous elements for 1D tensor";
if (rank == 2 && chunkSize < 2)
return emitError() << "expected chunk blocks for 2D tensor";
// If chunk size > 1, the second dimension of the tensor shape must equal
// the chunk size, and it must be a multiple of the chunk alignment factor.
if (chunkSize > 1) {
if (shape.back() != chunkSize)
return emitError() << "expected tensor shape[1] to match chunk size";
if (shape.back() % chunkAlignmentFactor != 0)
return emitError() << "expected tensor shape[1] to be a multiple of "
"chunk alignment factor "
<< chunkAlignmentFactor;
}
}
auto layoutAttr = llvm::dyn_cast_if_present<LayoutAttr>(layout);
if (layoutAttr) {
if (rank != (size_t)layoutAttr.getRank())
return emitError() << "expected layout rank to match tensor rank";
auto laneData = layoutAttr.getLaneData();
if (scatterAttr && laneData) {
// Validate subgroup mapping rules for scattered tensors.
// A work-item's slice of a tensor with shape [sg_size] or
// [sg_size, chunk_size] is [1] or [1, 32/element_ty_bit_width],
// respectively, and the mapping should reflect that. This is because each
// work item accesses data at 32-bit granularity.
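// For example (assuming 32-bit packed access): a 16x2 f16 scatter
// descriptor requires lane_data = [1, 2], since each work item reads two
// f16 values per 32-bit access; for f32, lane_data = [1, 1] is expected.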
if (rank > 1 && laneData[0] != 1)
return emitError()
<< "cannot map over non-contiguous scattered row elements";
if (laneData[rank - 1] != chunkAlignmentFactor)
return emitError() << "work item data mapping must match the number of "
"contiguous elements";
}
if (!XeGPUDialect::isEvenlyDistributable(shape, layoutAttr)) {
std::string shapeStr;
llvm::raw_string_ostream stream(shapeStr);
llvm::interleaveComma(shape, stream);
return emitError() << "cannot distribute [" << shapeStr << "] using "
<< layoutAttr;
}
}
return success();
}
} // namespace xegpu
} // namespace mlir
#include <mlir/Dialect/XeGPU/IR/XeGPUDialect.cpp.inc>
#define GET_ATTRDEF_CLASSES
#include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.cpp.inc>
#define GET_TYPEDEF_CLASSES
#include <mlir/Dialect/XeGPU/IR/XeGPUTypes.cpp.inc>