| //===- XeGPUDialect.cpp - MLIR XeGPU dialect implementation -----*- C++ -*-===// | 
 | // | 
 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
 | // See https://llvm.org/LICENSE.txt for license information. | 
 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
 | // | 
 | //===----------------------------------------------------------------------===// | 
 |  | 
 | #include "mlir/Dialect/Utils/IndexingUtils.h" | 
 | #include "mlir/Dialect/XeGPU/IR/XeGPU.h" | 
 | #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" | 
 | #include "mlir/IR/Builders.h" | 
 | #include "mlir/IR/DialectImplementation.h" | 
 | #include "llvm/ADT/TypeSwitch.h" | 
 | #include <numeric> | 
 |  | 
 | using std::optional; | 
 |  | 
 | namespace mlir { | 
 | namespace xegpu { | 
 |  | 
// Registers all XeGPU types, operations, and attributes with the dialect.
// The member lists are generated by TableGen into the *.cpp.inc files
// included below; each GET_*_LIST macro selects the corresponding
// comma-separated class list from its generated file.
void XeGPUDialect::initialize() {
  addTypes<
#define GET_TYPEDEF_LIST
#include <mlir/Dialect/XeGPU/IR/XeGPUTypes.cpp.inc>
      >();
  addOperations<
#define GET_OP_LIST
#include <mlir/Dialect/XeGPU/IR/XeGPU.cpp.inc>
      >();
  addAttributes<
#define GET_ATTRDEF_LIST
#include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.cpp.inc>
      >();
}
 |  | 
 | // Checks if the given shape can be evenly distributed based on the layout | 
 | // and data factors provided by the LayoutAttr. | 
 | bool XeGPUDialect::isEvenlyDistributable(llvm::ArrayRef<int64_t> shape, | 
 |                                          xegpu::LayoutAttr attr) { | 
 |   assert(attr && "Layout attribute is missing."); | 
 |  | 
 |   // Checks whether the given shape can be evenly distributed using the | 
 |   // specified layout and data attributes. If successful, it returns the work | 
 |   // size for each compute unit; otherwise, it returns `std::nullopt`. The work | 
 |   // size per compute unit is calculated as follows: | 
 |   //   - If `data` is null: newShape[i] = shape[i] / layout[i] | 
 |   //   - If `data` is not null: newShape[i] = data[i] | 
 |   // When round-robin distribution (`rr`) is enabled, `shape[i]` can be | 
 |   // smaller than `layout[i] * data[i]`, allowing multiple compute units to | 
 |   // share the data. | 
 |   auto tryDistribute = [&](llvm::ArrayRef<int64_t> shape, | 
 |                            DenseI32ArrayAttr layout, DenseI32ArrayAttr data, | 
 |                            bool rr = true) -> optional<SmallVector<int64_t>> { | 
 |     llvm::SmallVector<int64_t> newShape(shape); | 
 |     if (layout) { | 
 |       auto vec = llvm::to_vector_of<int64_t>(layout.asArrayRef()); | 
 |       if (vec.size() != shape.size()) | 
 |         return std::nullopt; | 
 |       auto ratio = computeShapeRatio(shape, vec); | 
 |       if (!ratio.has_value()) | 
 |         return std::nullopt; | 
 |       newShape = ratio.value(); | 
 |     } | 
 |  | 
 |     if (data) { | 
 |       auto vec = llvm::to_vector_of<int64_t>(data.asArrayRef()); | 
 |       if (vec.size() != shape.size()) | 
 |         return std::nullopt; | 
 |       auto ratio = computeShapeRatio(newShape, vec); | 
 |       if (!ratio.has_value() && rr) | 
 |         ratio = computeShapeRatio(vec, newShape); | 
 |       if (!ratio.has_value()) | 
 |         return std::nullopt; | 
 |  | 
 |       // if data is not null, we always return it for next phase. | 
 |       newShape = vec; | 
 |     } | 
 |     return newShape; | 
 |   }; | 
 |  | 
 |   // check the sgLayout and sgData | 
 |   auto maybeSgShape = | 
 |       tryDistribute(shape, attr.getSgLayout(), attr.getSgData()); | 
 |   if (!maybeSgShape) | 
 |     return false; | 
 |   auto sgShape = maybeSgShape.value(); | 
 |  | 
 |   // check InstData, it neither have layout nor need round-robin | 
 |   auto maybeInstShape = | 
 |       tryDistribute(sgShape, nullptr, attr.getInstData(), false); | 
 |   if (!maybeInstShape) | 
 |     return false; | 
 |   auto instShape = maybeInstShape.value(); | 
 |  | 
 |   // check LaneLayout and LaneData | 
 |   auto maybeLaneShape = | 
 |       tryDistribute(instShape, attr.getLaneLayout(), attr.getLaneData(), false); | 
 |   return maybeLaneShape.has_value(); | 
 | } | 
 |  | 
 | //===----------------------------------------------------------------------===// | 
 | // XeGPU_BlockTensorDescAttr | 
 | //===----------------------------------------------------------------------===// | 
 | BlockTensorDescAttr BlockTensorDescAttr::get(mlir::MLIRContext *context, | 
 |                                              xegpu::MemorySpace memory_space, | 
 |                                              int array_length, | 
 |                                              bool boundary_check) { | 
 |   auto scopeAttr = MemorySpaceAttr::get(context, memory_space); | 
 |   auto lengthAttr = | 
 |       IntegerAttr::get(IntegerType::get(context, 64), array_length); | 
 |   auto boundaryAttr = BoolAttr::get(context, boundary_check); | 
 |   return Base::get(context, scopeAttr, lengthAttr, boundaryAttr); | 
 | } | 
 |  | 
 | //===----------------------------------------------------------------------===// | 
 | // XeGPU_ScatterTensorDescAttr | 
 | //===----------------------------------------------------------------------===// | 
 | ScatterTensorDescAttr | 
 | ScatterTensorDescAttr::get(mlir::MLIRContext *context, | 
 |                            xegpu::MemorySpace memory_space, int chunk_size) { | 
 |   auto scopeAttr = MemorySpaceAttr::get(context, memory_space); | 
 |   auto chunkSizeAttr = | 
 |       IntegerAttr::get(IntegerType::get(context, 64), chunk_size); | 
 |   return Base::get(context, scopeAttr, chunkSizeAttr); | 
 | } | 
 |  | 
 | LogicalResult ScatterTensorDescAttr::verify( | 
 |     llvm::function_ref<mlir::InFlightDiagnostic()> emitError, | 
 |     MemorySpaceAttr memory_space, IntegerAttr chunk_size) { | 
 |   int64_t chunkSize = chunk_size.getInt(); | 
 |   SmallVector<int64_t> supportedChunkSizes = {1,  2,  3,  4,   8, | 
 |                                               16, 32, 64, 128, 256}; | 
 |   if (!llvm::is_contained(supportedChunkSizes, chunkSize)) | 
 |     return emitError() << "invalid chunk size"; | 
 |  | 
 |   return success(); | 
 | } | 
 |  | 
 | //===----------------------------------------------------------------------===// | 
 | // XeGPU_LayoutAttr | 
 | //===----------------------------------------------------------------------===// | 
 | LogicalResult | 
 | LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError, | 
 |                    DenseI32ArrayAttr sg_layout, DenseI32ArrayAttr sg_data, | 
 |                    DenseI32ArrayAttr inst_data, DenseI32ArrayAttr lane_layout, | 
 |                    DenseI32ArrayAttr lane_data, DenseI32ArrayAttr order) { | 
 |  | 
 |   // A valid layout must include at least one of sg_layout and lane_layout. | 
 |   // sg_layout is essential for Workgroup layout, while lane_layout is | 
 |   // required for Subgroup layout. | 
 |   if (!sg_layout && !inst_data && !lane_layout) { | 
 |     return emitError() | 
 |            << "expected at least one of sg_layout, inst_data or lane_layout"; | 
 |   } | 
 |  | 
 |   // generate code to check sg_laout, inst_data and lane_layout having the same | 
 |   // rank if they are not null. | 
 |  | 
 |   if (sg_layout && inst_data && sg_layout.size() != inst_data.size()) { | 
 |     return emitError() | 
 |            << "expected sg_layout and inst_data to have the same rank"; | 
 |   } | 
 |  | 
 |   if (sg_layout && lane_layout && sg_layout.size() != lane_layout.size()) { | 
 |     return emitError() | 
 |            << "expected sg_layout and lane_layout to have the same rank"; | 
 |   } | 
 |  | 
 |   if (inst_data && lane_layout && inst_data.size() != lane_layout.size()) { | 
 |     return emitError() | 
 |            << "expected inst_data and lane_layout to have the same rank"; | 
 |   } | 
 |  | 
 |   // sg_data is optional for Workgroup layout, but its presence requires | 
 |   // sg_layout. | 
 |   if (sg_data) { | 
 |     if (!sg_layout) | 
 |       return emitError() << "expected sg_layout being used with sg_data"; | 
 |     if (sg_data.size() != sg_layout.size()) | 
 |       return emitError() | 
 |              << "expected sg_data and sg_layout to have the same rank"; | 
 |   } | 
 |  | 
 |   // lane_data is optional for Subgroup layout, but its presence requires | 
 |   // lane_layout. | 
 |   if (lane_data) { | 
 |     if (!lane_layout) | 
 |       return emitError() << "expected lane_layout being used with lane_data"; | 
 |     if (lane_data.size() != lane_layout.size()) | 
 |       return emitError() | 
 |              << "expected lane_data and lane_layout to have the same rank"; | 
 |   } | 
 |  | 
 |   if (order) { | 
 |     if (!sg_layout && !lane_layout) | 
 |       return emitError() | 
 |              << "expected sg_layout/lane_layout being used with order"; | 
 |  | 
 |     if (sg_layout && order.size() != sg_layout.size()) | 
 |       return emitError() | 
 |              << "expected order and sg_layout to have the same rank"; | 
 |  | 
 |     if (lane_layout && order.size() != lane_layout.size()) | 
 |       return emitError() | 
 |              << "expected order and lane_layout to have the same rank"; | 
 |   } | 
 |  | 
 |   return success(); | 
 | } | 
 |  | 
 | //===----------------------------------------------------------------------===// | 
 | // XeGPU_TensorDescType | 
 | //===----------------------------------------------------------------------===// | 
 |  | 
 | mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) { | 
 |   llvm::SmallVector<int64_t> shape; | 
 |   mlir::Type elementType; | 
 |   mlir::FailureOr<mlir::Attribute> encoding; | 
 |   mlir::FailureOr<mlir::Attribute> layout; | 
 |  | 
 |   // Parse literal '<' | 
 |   if (parser.parseLess()) | 
 |     return {}; | 
 |  | 
 |   auto shapeLoc = parser.getCurrentLocation(); | 
 |   if (mlir::failed(parser.parseDimensionList(shape))) { | 
 |     parser.emitError(shapeLoc, "failed to parse parameter 'shape'"); | 
 |     return {}; | 
 |   } | 
 |  | 
 |   auto elemTypeLoc = parser.getCurrentLocation(); | 
 |   if (mlir::failed(parser.parseType(elementType))) { | 
 |     parser.emitError(elemTypeLoc, "failed to parse parameter 'elementType'"); | 
 |     return {}; | 
 |   } | 
 |  | 
 |   // parse optional attributes | 
 |   while (mlir::succeeded(parser.parseOptionalComma())) { | 
 |     mlir::Attribute attr; | 
 |     ParseResult res = parser.parseAttribute(attr); | 
 |     if (mlir::succeeded(res)) { | 
 |       if (mlir::isa<LayoutAttr>(attr)) { | 
 |         layout = attr; | 
 |         continue; | 
 |       } | 
 |       if (mlir::isa<BlockTensorDescAttr, ScatterTensorDescAttr>(attr)) { | 
 |         encoding = attr; | 
 |         continue; | 
 |       } | 
 |     } | 
 |     return {}; | 
 |   } | 
 |  | 
 |   // Parse literal '>' | 
 |   if (parser.parseGreater()) | 
 |     return {}; | 
 |  | 
 |   return TensorDescType::getChecked( | 
 |       [&]() { return parser.emitError(parser.getNameLoc()); }, | 
 |       parser.getContext(), shape, elementType, | 
 |       encoding.value_or(mlir::Attribute()), layout.value_or(mlir::Attribute())); | 
 | } | 
 |  | 
 | void TensorDescType::print(::mlir::AsmPrinter &printer) const { | 
 |   printer << "<"; | 
 |  | 
 |   auto shape = getShape(); | 
 |   for (int64_t dim : shape) { | 
 |     if (mlir::ShapedType::isDynamic(dim)) | 
 |       printer << '?'; | 
 |     else | 
 |       printer << dim; | 
 |     printer << 'x'; | 
 |   } | 
 |  | 
 |   printer << getElementType(); | 
 |  | 
 |   if (auto encoding = getEncoding()) | 
 |     printer << ", " << encoding; | 
 |  | 
 |   if (auto layout = getLayout()) | 
 |     printer << ", " << layout; | 
 |  | 
 |   printer << ">"; | 
 | } | 
 |  | 
 | TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape, | 
 |                                    mlir::Type elementType, int array_length, | 
 |                                    bool boundary_check, | 
 |                                    MemorySpace memory_space, | 
 |                                    mlir::Attribute layout) { | 
 |   auto context = elementType.getContext(); | 
 |   auto attr = BlockTensorDescAttr::get(context, memory_space, array_length, | 
 |                                        boundary_check); | 
 |   return Base::get(context, shape, elementType, attr, layout); | 
 | } | 
 |  | 
 | TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape, | 
 |                                    mlir::Type elementType, int chunk_size, | 
 |                                    MemorySpace memory_space, | 
 |                                    mlir::Attribute layout) { | 
 |   auto context = elementType.getContext(); | 
 |   auto attr = ScatterTensorDescAttr::get(context, memory_space, chunk_size); | 
 |   return Base::get(context, shape, elementType, attr, layout); | 
 | } | 
 |  | 
 | LogicalResult TensorDescType::verify( | 
 |     llvm::function_ref<::mlir::InFlightDiagnostic()> emitError, | 
 |     llvm::ArrayRef<int64_t> shape, mlir::Type elementType, | 
 |     mlir::Attribute encoding, mlir::Attribute layout) { | 
 |   size_t rank = shape.size(); | 
 |   if (rank != 1 && rank != 2) | 
 |     return emitError() << "expected 1D or 2D tensor"; | 
 |  | 
 |   auto blockAttr = mlir::dyn_cast_if_present<BlockTensorDescAttr>(encoding); | 
 |   if (blockAttr) { | 
 |     MemorySpaceAttr memorySpaceAttr = blockAttr.getMemorySpace(); | 
 |     if (rank == 2 && memorySpaceAttr && | 
 |         memorySpaceAttr.getValue() == MemorySpace::SLM) | 
 |       return emitError() << "SLM is not supported for 2D block tensor"; | 
 |   } | 
 |  | 
 |   // for gather and scatter ops, Low-precision types are packed in 32-bit units. | 
 |   unsigned bitWidth = elementType.getIntOrFloatBitWidth(); | 
 |   int chunkAlignmentFactor = | 
 |       bitWidth < targetinfo::packedSizeInBitsForGatherScatter | 
 |           ? targetinfo::packedSizeInBitsForGatherScatter / bitWidth | 
 |           : 1; | 
 |   auto scatterAttr = mlir::dyn_cast_if_present<ScatterTensorDescAttr>(encoding); | 
 |   if (scatterAttr) { | 
 |     // Expected tensor ranks for scattered data: | 
 |     //   - 1D tensor for fully non-contiguous elements (chunk size == 1) | 
 |     //   - 2D tensor for scattered blocks (chunk size > 1) | 
 |     unsigned chunkSize = scatterAttr.getChunkSize().getInt(); | 
 |     if (rank == 1 && chunkSize != 1) | 
 |       return emitError() << "expected non-contiguous elements for 1D tensor"; | 
 |     if (rank == 2 && chunkSize < 2) | 
 |       return emitError() << "expected chunk blocks for 2D tensor"; | 
 |     // If chunk size > 1, the second dimension of the tensor shape must be | 
 |     // equal to chunk size and it must be a multiple of the packing factor. | 
 |     if (chunkSize > 1) { | 
 |       if (shape.back() != chunkSize) | 
 |         return emitError() << "expected tensor shape[1] to match chunk size"; | 
 |       if (shape.back() % chunkAlignmentFactor != 0) | 
 |         return emitError() << "expected tensor shape[1] to be a multiple of " | 
 |                               "chunk alignment factor " | 
 |                            << chunkAlignmentFactor; | 
 |     } | 
 |   } | 
 |  | 
 |   auto layoutAttr = llvm::dyn_cast_if_present<LayoutAttr>(layout); | 
 |   if (layoutAttr) { | 
 |     if (rank != (size_t)layoutAttr.getRank()) | 
 |       return emitError() << "expected layout rank to match tensor rank"; | 
 |  | 
 |     auto laneData = layoutAttr.getLaneData(); | 
 |     if (scatterAttr && laneData) { | 
 |       // Validate subgroup mapping rules for scattered tensors. | 
 |       // A work-item's slice of the tensor with shape [sg_size] or | 
 |       // [sg_size, chunk_size] will be [1] or [1, 32/element_ty_bit_width] | 
 |       // respectively, the mapping should reflect that. This is because each | 
 |       // work item access data in 32 bit granularity. | 
 |  | 
 |       if (rank > 1 && laneData[0] != 1) | 
 |         return emitError() | 
 |                << "cannot map over non-contiguous scattered row elements"; | 
 |       if (laneData[rank - 1] != chunkAlignmentFactor) | 
 |         return emitError() << "work item data mapping must match the number of " | 
 |                               "contiguous elements"; | 
 |     } | 
 |  | 
 |     if (!XeGPUDialect::isEvenlyDistributable(shape, layoutAttr)) { | 
 |       std::string shapeStr; | 
 |       llvm::raw_string_ostream stream(shapeStr); | 
 |       llvm::interleaveComma(shape, stream); | 
 |       return emitError() << "cannot distribute [" << shapeStr << "] using " | 
 |                          << layoutAttr; | 
 |     } | 
 |   } | 
 |   return success(); | 
 | } | 
 |  | 
 | } // namespace xegpu | 
 | } // namespace mlir | 
 |  | 
 | #include <mlir/Dialect/XeGPU/IR/XeGPUDialect.cpp.inc> | 
 | #define GET_ATTRDEF_CLASSES | 
 | #include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.cpp.inc> | 
 | #define GET_TYPEDEF_CLASSES | 
 | #include <mlir/Dialect/XeGPU/IR/XeGPUTypes.cpp.inc> |