| //===- XeGPUOps.td - XeGPU dialect operations definition ----*- tablegen -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD |
| #define MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD |
| |
| include "mlir/Dialect/Arith/IR/ArithBase.td" |
| include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td" |
| include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td" |
| include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td" |
| include "mlir/Interfaces/ShapedOpInterfaces.td" |
| include "mlir/Interfaces/SideEffectInterfaces.td" |
| include "mlir/Interfaces/ViewLikeInterface.td" |
| |
| // Common base for every XeGPU operation. It derives from the generic `Op` |
| // class in OpBase.td and fixes: |
| // * the owning dialect (XeGPU_Dialect), |
| // * the mnemonic, i.e. the op name without the dialect prefix, |
| // * the list of traits attached to the op. |
| // It also provides `extraBaseClassDeclaration`, a C++ snippet that derived ops |
| // can splice into their `extraClassDeclaration` to print/parse the op |
| // properties in the `<attr>` form. |
| class XeGPU_Op<string mnemonic, list<Trait> traits = []>: |
| Op<XeGPU_Dialect, mnemonic, traits> { |
| |
| code extraBaseClassDeclaration = [{ |
| // Prints the properties, converted to an attribute, wrapped in `<...>`. |
| // Nothing is printed when the conversion yields a null attribute. |
| void printProperties(::mlir::MLIRContext *ctx, |
| ::mlir::OpAsmPrinter &p, const Properties &prop, |
| ::mlir::ArrayRef<::llvm::StringRef> elidedProps) { |
| if (Attribute propAttr = getPropertiesAsAttr(ctx, prop)) |
| p << "<" << propAttr << ">"; |
| } |
| |
| // Parses an optional `<attr>` group into `result.propertiesAttr`. |
| static ::mlir::ParseResult parseProperties(::mlir::OpAsmParser &parser, |
| ::mlir::OperationState &result) { |
| // No leading `<`: there is no property group to parse. |
| if (mlir::failed(parser.parseOptionalLess())) |
| return success(); |
| if (parser.parseAttribute(result.propertiesAttr) || parser.parseGreater()) |
| return failure(); |
| return success(); |
| } |
| |
| }]; |
| } |
| |
| |
| def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface, |
| AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> { |
| |
| let summary = "Create nd-tensor descriptor operation"; |
| let description = [{ |
| The "create_nd_tdesc" operation creates a TensorDescType which represents |
| a sub-view of a 2D memory region (It can be extended to support n-D memory |
| region if needed in future). Elements in the subview are contiguous in each |
| dimension. It encodes the following important information for supporting |
| Intel hardware features: |
| |
| * source: an object representing (starting address/pointer of) a 2D memory region. |
| It can be either a 2D memref object, or simply a pointer represented by uint64_t type. |
| For the latter case, the shape and layout information of the 2D memory region should |
| be explicitly passed via `shape` and `strides` parameters. |
| * offsets: two index values representing offsets from the "source" in each dimension |
| at which the subview of the target memory will be created. It is encoded via two |
| variables, including "offsets" and "const_offsets", such that it can |
| accept various forms, such as, operands (e.g., [%c0, %c1]) and attributes (e.g., [2, 4]). |
| * shape: the shape information of the memory region pointed by the "source". It is |
| typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>. |
| But if "source" is simply a pointer represented as uint64_t type, or a memref |
| type without shape information e.g., memref<?x?xf16>, the shape information has |
| to be explicitly passed via the "shape" and "const_shape" arguments. |
| * strides: the strides of the memory region pointed by the "source". Similar to shape, |
| it is typically encoded via the MemRefType of the source too. But if "source" is |
| simply a pointer represented as uint64_t type, or a memref type without shape |
| information e.g., memref<?x?xf16>, the strides information has to be explicitly |
| passed via the "strides" and "const_strides" arguments. |
| |
| Example 1 (suppose the tensor shape inferred by the compiler is 8x16): |
| ``` |
| %0 = memref.alloc() : memref<1024x1024xf32> |
| %c0 = arith.constant 0 : index |
| %c1 = arith.constant 1 : index |
| %1 = xegpu.create_nd_tdesc %0[%c0, %c0]: memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> |
| ``` |
| |
| Example 2 (suppose the tensor shape inferred by the compiler is 8x16): |
| ``` |
| %0 = memref.alloc(%h, %w) : memref<?x?xf32> |
| %c0 = arith.constant 0 : index |
| %c1 = arith.constant 1 : index |
| %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32> |
| ``` |
| |
| Example 3 (suppose the tensor shape inferred by the compiler is 8x16): |
| ``` |
| %0 = ... : ui64 |
| %c0 = arith.constant 0 : index |
| %c1 = arith.constant 1 : index |
| %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> !xegpu.tensor_desc<8x16xf32> |
| ``` |
| }]; |
| |
| // Each of offsets/shape/strides is split into dynamic operands plus a |
| // `const_*` attribute; only `const_offsets` is mandatory. |
| let arguments = (ins |
| XeGPU_BaseAddrType: $source, |
| Variadic<Index>: $offsets, |
| Variadic<Index>: $shape, |
| Variadic<Index>: $strides, |
| DenseI64ArrayAttr: $const_offsets, |
| OptionalAttr<DenseI64ArrayAttr>: $const_shape, |
| OptionalAttr<DenseI64ArrayAttr>: $const_strides |
| ); |
| let results = (outs XeGPU_TensorDesc: $TensorDesc); |
| |
| // The shape and strides lists are optional in the syntax, but when present |
| // they must appear together. |
| let assemblyFormat = [{ |
| $source `` |
| custom<DynamicIndexList>($offsets, $const_offsets) |
| (`,` custom<DynamicIndexList>($shape, $const_shape)^ |
| `,` custom<DynamicIndexList>($strides, $const_strides))? |
| attr-dict `:` type($source) `->` qualified(type($TensorDesc)) |
| }]; |
| |
| let hasVerifier = 1; |
| |
| let builders = [ |
| OpBuilder<(ins "Type": $tdesc, "TypedValue<MemRefType>": $source, |
| "llvm::ArrayRef<OpFoldResult>": $offsets)>, |
| |
| OpBuilder<(ins "Type": $tdesc, "TypedValue<IntegerType> ": $source, |
| "llvm::ArrayRef<OpFoldResult>": $offsets, |
| "llvm::ArrayRef<OpFoldResult>": $shape, |
| "llvm::ArrayRef<OpFoldResult>": $strides)> |
| ]; |
| |
| let extraClassDeclaration = extraBaseClassDeclaration # [{ |
| /// Returns the type of the source operand. |
| Type getSourceType() { |
| return getSource().getType(); |
| } |
| |
| /// Returns the type of the result TensorDesc. |
| xegpu::TensorDescType getType() { |
| return getTensorDesc().getType(); |
| } |
| |
| /// Returns the element type of the TensorDesc. |
| Type getElementType() { |
| return getType().getElementType(); |
| } |
| |
| /// Returns the shape of the TensorDesc. |
| llvm::ArrayRef<int64_t> getTensorDescShape() { |
| return getType().getShape(); |
| } |
| |
| /// Wrapper for matching with OffsetSizeAndStrideOpInterface: the dynamic |
| /// sizes are the `shape` operands. |
| OperandRange getSizes() { |
| return getShape(); |
| } |
| |
| /// Wrapper for matching with OffsetSizeAndStrideOpInterface: the static |
| /// offsets are the `const_offsets` attribute. |
| ArrayRef<int64_t> getStaticOffsets(){ |
| return getConstOffsets(); |
| } |
| |
| /// Wrapper for matching with OffsetSizeAndStrideOpInterface. |
| /// If source is IntegerType or `const_shape` is filled, |
| /// it will return `const_shape`, such that mixes of `shape` |
| /// and `const_shape` will be used to represent the shape of |
| /// source operand. They override static shape from source memref type. |
| ArrayRef<int64_t> getStaticSizes() { |
| auto attr = getConstShapeAttr(); |
| if (llvm::isa<IntegerType>(getSourceType()) || attr) |
| return attr; |
| |
| auto memrefType = llvm::dyn_cast<MemRefType>(getSourceType()); |
| assert(memrefType && "Incorrect use of getStaticSizes"); |
| return memrefType.getShape(); |
| } |
| |
| /// Wrapper for matching with OffsetSizeAndStrideOpInterface. |
| /// If source is IntegerType or `const_strides` is filled, it |
| /// will return `const_strides`, such that mixes of `strides` |
| /// and `const_strides` will be used to represent the strides of |
| /// source operand. They override static strides from source memref type. |
| /// |
| /// NOTE: when the strides have to be derived from the memref type, this |
| /// getter stores them into the `const_strides` attribute of the op (see |
| /// setConstStrides below), i.e. calling it modifies the operation. |
| ArrayRef<int64_t> getStaticStrides() { |
| auto attr = getConstStridesAttr(); |
| if (llvm::isa<IntegerType>(getSourceType()) || attr) |
| return attr; |
| |
| auto memrefType = llvm::dyn_cast<MemRefType>(getSourceType()); |
| assert(memrefType && "Incorrect use of getStaticStrides"); |
| auto [strides, offset] = getStridesAndOffset(memrefType); |
| // Reuse the storage of ConstStridesAttr since the strides computed |
| // from the memref type are not persistent. |
| setConstStrides(strides); |
| attr = getConstStridesAttr(); |
| return attr; |
| } |
| |
| /// Return the expected rank of each of the `const_offsets`, |
| /// `const_shape` and `const_strides` attributes: the memref rank when the |
| /// source is a memref, otherwise the number of mixed offsets. |
| std::array<unsigned, 3> getArrayAttrMaxRanks() { |
| unsigned rank; |
| if (auto ty = llvm::dyn_cast<MemRefType>(getSourceType())) { |
| rank = ty.getRank(); |
| } else { |
| rank = (unsigned)getMixedOffsets().size(); |
| } |
| return {rank, rank, rank}; |
| } |
| |
| /// Return the number of leading operands before the `offsets`, |
| /// `shape` and `strides` operands. |
| static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; } |
| |
| /// ViewLikeOpInterface hook: the viewed value is the `source` operand. |
| mlir::Value getViewSource() { return getSource(); } |
| }]; |
| } |
| |
| // Prefetch of the block described by a TensorDesc, with one optional cache |
| // hint attribute per cache level. |
| def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { |
| let summary = "prefetches a n-D block to cache"; |
| let description = [{ |
| It issues an instruction to prefetch a block of data from continuous |
| memory regions to each level of the cache based on their cache policy. |
| |
| Example: |
| ``` |
| xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>, |
| l2_hint = #xegpu.cache_hint<cached>, |
| l3_hint = #xegpu.cache_hint<cached>} |
| : !xegpu.tensor_desc<8x16xf16> |
| ``` |
| |
| }]; |
| |
| let arguments = (ins |
| XeGPU_TensorDesc: $TensorDesc, |
| OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint, |
| OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint, |
| OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint |
| ); |
| |
| let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc))"; |
| let hasVerifier = 1; |
| |
| let extraClassDeclaration = extraBaseClassDeclaration # [{ |
| /// Type of the TensorDesc operand. |
| xegpu::TensorDescType getTensorDescType() { return getTensorDesc().getType(); } |
| }]; |
| } |
| |
| |
| def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "TensorDesc"]>, |
| AllElementCountsMatch<["value", "TensorDesc"]>]> { |
| // The two adjacent literals are concatenated, so the first one must end |
| // with a space. |
| let summary = "loads a n-D block from memory (represented by TensorDesc) " |
| "to registers (represented by vector)"; |
| let description = [{ |
| LoadNdOp essentially mimics the hardware block read instruction to read |
| a block of data from memory to register. It takes a set of optional cache |
| hints for each level of cache, L1, L2 and L3. If hardware does not have a |
| corresponding cache, the corresponding cache hint attribute will be masked. |
| VNNI transformation is a hardware feature for Intel GPU, which is used to |
| do data packing during the load for B operand of matrix operation, if |
| the bit width of the data type is less than 32 bits, e.g., fp16. And |
| transpose is another Intel hardware feature, which will do transpose |
| operation when loading the data if the data type is fp32 or fp64. It |
| implies that vnni and transpose cannot exist at the same time. |
| |
| Example: |
| ``` |
| xegpu.load_nd %1 {transpose = [1, 0], |
| l1_hint = #xegpu.cache_hint<cached>, |
| l2_hint = #xegpu.cache_hint<uncached>, |
| l3_hint = #xegpu.cache_hint<streaming>} |
| : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32> |
| ``` |
| |
| |
| }]; |
| |
| let arguments = (ins XeGPU_TensorDesc: $TensorDesc, |
| OptionalAttr<I64Attr>: $vnni_axis, |
| OptionalAttr<DenseI64ArrayAttr>: $transpose, |
| OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint, |
| OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint, |
| OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint); |
| |
| let results = (outs XeGPU_ValueType: $value); |
| |
| let extraClassDeclaration = extraBaseClassDeclaration # [{ |
| /// Returns the result type as a VectorType; the result is a null |
| /// VectorType when the loaded value is not a vector (dyn_cast). |
| VectorType getType() { |
| return llvm::dyn_cast<VectorType>(getValue().getType()); |
| } |
| |
| /// Returns the type of the TensorDesc operand. |
| xegpu::TensorDescType getTensorDescType() { |
| return getTensorDesc().getType(); |
| } |
| }]; |
| |
| let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)"; |
| let hasVerifier = 1; |
| } |
| |
| def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [AllShapesMatch<["value", "TensorDesc"]>, |
| AllElementTypesMatch<["value", "TensorDesc"]>]> { |
| let summary = "stores a n-D block register region back to memory, currently only supports 2D"; |
| |
| let description = [{ |
| StoreNdOp essentially mimics the hardware block write instruction to |
| write a block of data from register into the memory region as described |
| by the TensorDesc. It takes a set of optional cache hints for each level |
| of cache, L1, L2 and L3. If hardware does not have a corresponding cache, |
| the corresponding cache hint attribute will be masked. |
| |
| Example: |
| ``` |
| xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>, |
| l2_hint = #xegpu.cache_hint<write_back>, |
| l3_hint = #xegpu.cache_hint<write_through>} |
| : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> |
| ``` |
| |
| |
| }]; |
| |
| let arguments = (ins XeGPU_ValueType: $value, |
| XeGPU_TensorDesc: $TensorDesc, |
| OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint, |
| OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint, |
| OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint); |
| |
| let extraClassDeclaration = extraBaseClassDeclaration # [{ |
| /// Returns the type of the stored value as a VectorType; the result is a |
| /// null VectorType when the value is not a vector (dyn_cast). |
| VectorType getValueType() { |
| return llvm::dyn_cast<VectorType>(getValue().getType()); |
| } |
| |
| /// Returns the type of the TensorDesc operand. |
| xegpu::TensorDescType getTensorDescType() { |
| return getTensorDesc().getType(); |
| } |
| }]; |
| |
| let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict |
| `:` type($value) `,` qualified(type($TensorDesc))}]; |
| let hasVerifier = 1; |
| } |
| |
| // Produces a TensorDesc of the same type as its input, taking a mixed |
| // (operand + `const_offsets`) list of offsets. |
| def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset", |
| [AllTypesMatch<["TensorDesc", "result"]>]> { |
| let summary = "It updates the offsets for the TensorDesc."; |
| let description = [{The op updates the offset of the given TensorDesc. |
| The offsets are relative offset to the current position in the number |
| of elements. It will result in a same type TensorDesc as the input. |
| |
| example: |
| ``` |
| %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32> |
| ``` |
| }]; |
| |
| let arguments = (ins |
| XeGPU_TensorDesc: $TensorDesc, |
| Variadic<Index>: $offsets, |
| DenseI64ArrayAttr: $const_offsets); |
| let results = (outs XeGPU_TensorDesc: $result); |
| |
| let assemblyFormat = [{ |
| $TensorDesc `,` |
| custom<DynamicIndexList>($offsets, $const_offsets) |
| attr-dict `:` qualified(type($result)) |
| }]; |
| |
| let hasVerifier = 1; |
| |
| let extraClassDeclaration = extraBaseClassDeclaration # [{ |
| /// Type of the TensorDesc operand. |
| xegpu::TensorDescType getTensorDescType() { return getTensorDesc().getType(); } |
| |
| /// Offsets as OpFoldResults, merging `const_offsets` with the dynamic |
| /// `offsets` operands. |
| SmallVector<OpFoldResult> getMixedOffsets() { |
| Builder builder(getContext()); |
| return getMixedValues(getConstOffsets(), getOffsets(), builder); |
| } |
| |
| /// Number of entries in the mixed offset list. |
| size_t getNumOffsets() { return getMixedOffsets().size(); } |
| |
| /// The `idx`-th mixed offset; asserts that `idx` is in range. |
| OpFoldResult getOffset(unsigned idx) { |
| SmallVector<OpFoldResult> mixed = getMixedOffsets(); |
| assert(idx < mixed.size() && "Invalid out of bound access."); |
| return mixed[idx]; |
| } |
| }]; |
| } |
| |
| def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { |
| let summary = "create scattered tensor descriptors (TensorDesc)."; |
| let description = [{ |
| "create_tdesc" is similar to "create_nd_tdesc" in terms that it creates |
| a Tensor Descriptor (TensorDescType) for a memory region. While "create_nd_tdesc" |
| is for creating continuous subviews, "create_tdesc" is for creating non-continuous |
| (scattered) subviews, allowing each work-item in a subgroup to specify its own offset. |
| It accepts the following parameters: |
| |
| * source: a 1D memref or pointer (uint64_t) representing the flattened memory object. |
| * offsets: an array containing offsets of each access point. Its size |
| is fixed to the hardware supported subgroup size, e.g., 16 on PVC, |
| implying each element in the array corresponds to a work-item (SIMT lane) |
| in the subgroup. |
| * chunk_size: [optional attribute] indicates the number of contiguous |
| elements accessed for each offset, default is 1. |
| |
| Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64] |
| ``` |
| %a = memref.alloc() : memref<1024xf32> |
| %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> |
| -> !xegpu.tensor_desc<4xf32, #xegpu.tdesc_attr<scattered = true>> |
| ``` |
| |
| Example 2. It assumes subgroup size is 4, and each work-item accesses 8 elements. |
| It will access 32 data elements in total: a[0:7], a[16:23], a[32:39], a[64:71] |
| ``` |
| %0 = memref.alloc() : memref<1024xf32> |
| %1 = xegpu.create_tdesc %0[0, 16, 32, 64] {chunk_size = 8}: memref<1024xf32> |
| -> !xegpu.tensor_desc<4x8xf32, #xegpu.tdesc_attr<scattered = true>> |
| ``` |
| |
| Example 3. It is similar to Example 2, but there are some overlaps among work-items. |
| It accesses: a[0:7], a[4:11], a[8:15], a[12:19] |
| ``` |
| %0 = memref.alloc() : memref<1024xf32> |
| %1 = xegpu.create_tdesc %0[0, 4, 8, 12] {chunk_size = 8}: memref<1024xf32> |
| -> !xegpu.tensor_desc<4x8xf32, #xegpu.tdesc_attr<scattered = true>> |
| ``` |
| }]; |
| |
| let arguments = (ins XeGPU_BaseAddrType: $source, |
| Variadic<Index>: $offsets, |
| DenseI64ArrayAttr: $const_offsets, |
| DefaultValuedAttr<I64Attr, "1">: $chunk_size); |
| let results = (outs XeGPU_TensorDesc:$TensorDesc); |
| |
| let builders = [ |
| OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source, |
| "llvm::ArrayRef<OpFoldResult>": $offsets, |
| CArg<"uint32_t", "1"> : $chunk_size)>, |
| ]; |
| |
| let assemblyFormat = [{ |
| $source |
| custom<DynamicIndexList>($offsets, $const_offsets) |
| attr-dict `:` type($source) `->` qualified(type($TensorDesc)) |
| }]; |
| |
| let extraClassDeclaration = extraBaseClassDeclaration # [{ |
| /// Returns the type of the result TensorDesc. |
| xegpu::TensorDescType getTensorDescType() { |
| return getTensorDesc().getType(); |
| } |
| |
| /// Returns the offsets as OpFoldResults, merging `const_offsets` with the |
| /// dynamic `offsets` operands. |
| SmallVector<OpFoldResult> getMixedOffsets() { |
| Builder b(getContext()); |
| return getMixedValues(getConstOffsets(), getOffsets(), b); |
| } |
| |
| /// Returns the number of entries in the mixed offset list. |
| size_t getNumOffsets() { |
| return getMixedOffsets().size(); |
| } |
| |
| /// ViewLikeOpInterface hook: the viewed value is the `source` operand. |
| mlir::Value getViewSource() { return getSource(); } |
| |
| /// Returns the `idx`-th mixed offset; asserts that `idx` is in range. |
| OpFoldResult getOffset(unsigned idx) { |
| assert(idx < getNumOffsets() && "Invalid out of bound access."); |
| return getMixedOffsets()[idx]; |
| } |
| }]; |
| |
| let hasVerifier = 1; |
| } |
| |
| // Prefetch through a TensorDesc operand, with one optional cache hint |
| // attribute per cache level. |
| def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { |
| let summary = "prefetches a set of scattered data points to cache"; |
| let description = [{ |
| It issues instructions to prefetch a set of scattered data points |
| from memory to each level of the cache based on their cache policy. |
| As compared to prefetch_nd, which works on non-scattered TensorDesc, |
| it works on scattered TensorDesc instead. |
| |
| Example: |
| ``` |
| xegpu.prefetch %tdesc {l1_hint = #xegpu.cache_hint<cached>, |
| l2_hint = #xegpu.cache_hint<cached>, |
| l3_hint = #xegpu.cache_hint<cached>} |
| : !xegpu.tensor_desc<16xf16> |
| ``` |
| |
| }]; |
| |
| let arguments = (ins |
| XeGPU_TensorDesc: $TensorDesc, |
| OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint, |
| OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint, |
| OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint |
| ); |
| |
| let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc))"; |
| let hasVerifier = 1; |
| |
| let extraClassDeclaration = extraBaseClassDeclaration # [{ |
| /// Type of the TensorDesc operand. |
| xegpu::TensorDescType getTensorDescType() { return getTensorDesc().getType(); } |
| }]; |
| } |
| |
| def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]>, |
| AllElementTypesMatch<["value", "TensorDesc"]>, |
| AllElementCountsMatch<["value", "TensorDesc"]>]> { |
| let summary = "load a set of scattered data points from memory."; |
| |
| let description = [{ It (aka. load) loads data per each work-item. The output |
| describes the data being loaded at the subgroup level, so its size is |
| consistent with the number of work-items in a subgroup. When `chunk_size_per_lane` |
| attribute is larger than 1 in TensorDesc, the output vector will be a 2D vector, |
| with dim-1 corresponding to the chunk size. |
| |
| The mask operand masks out memory access so that it is safe to pass out-of-boundary |
| addresses/offsets as long as they are masked. It applies to slots of SIMD lanes. |
| |
| Example: |
| ``` |
| %2 = xegpu.load %1, %0 {transpose = [1, 0], |
| l1_hint = #xegpu.cache_hint<cached>, |
| l2_hint = #xegpu.cache_hint<uncached>, |
| l3_hint = #xegpu.cache_hint<uncached>} |
| : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered=true>>, vector<16xi1> |
| -> vector<16xf32> |
| ``` |
| |
| }]; |
| |
| let arguments = (ins XeGPU_TensorDesc: $TensorDesc, |
| XeGPU_MaskType: $mask, |
| OptionalAttr<DenseI64ArrayAttr>: $transpose, |
| OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint, |
| OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint, |
| OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint); |
| let results = (outs XeGPU_ValueType: $value); |
| |
| let extraClassDeclaration = extraBaseClassDeclaration # [{ |
| /// Returns the type of the TensorDesc operand. |
| xegpu::TensorDescType getTensorDescType() { |
| return getTensorDesc().getType(); |
| } |
| |
| /// Returns the element type of the loaded value, or the value type |
| /// itself when it has no element type (getElementTypeOrSelf). |
| mlir::Type getElementType() { |
| auto type = getValue().getType(); |
| return getElementTypeOrSelf(type); |
| } |
| |
| /// Returns the type of the loaded value. |
| Type getValueType() { |
| return getValue().getType(); |
| } |
| |
| /// Returns the type of the mask operand. |
| Type getMaskType() { |
| return getMask().getType(); |
| } |
| |
| }]; |
| |
| let assemblyFormat = [{$TensorDesc `,` $mask prop-dict attr-dict |
| `:` qualified(type($TensorDesc)) `,` type($mask) `->` type($value)}]; |
| |
| let hasVerifier = 1; |
| } |
| |
| def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDesc"]>, |
| AllElementTypesMatch<["value", "TensorDesc"]>]> { |
| let summary = "store data to scattered memory locations."; |
| let description = [{ It (aka. store) stores data to scattered memory locations. |
| It has similar semantics to `xegpu.load` (LoadGatherOp). The op produces no |
| result. |
| |
| Example: |
| ``` |
| xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<uncached>, |
| l2_hint = #xegpu.cache_hint<write_back>, |
| l3_hint = #xegpu.cache_hint<write_through>} |
| : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered=true>>, vector<16xi1> |
| ``` |
| }]; |
| |
| let arguments = (ins |
| XeGPU_ValueType: $value, |
| XeGPU_TensorDesc: $TensorDesc, |
| XeGPU_MaskType: $mask, |
| OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint, |
| OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint, |
| OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint); |
| |
| let extraClassDeclaration = extraBaseClassDeclaration # [{ |
| /// Returns the type of the TensorDesc operand. |
| xegpu::TensorDescType getTensorDescType() { |
| return getTensorDesc().getType(); |
| } |
| |
| /// Returns the type of the stored value. |
| Type getValueType() { |
| return getValue().getType(); |
| } |
| |
| /// Returns the type of the mask operand. |
| Type getMaskType() { |
| return getMask().getType(); |
| } |
| }]; |
| |
| let assemblyFormat = [{$value `,` $TensorDesc `,` $mask prop-dict attr-dict |
| `:` type($value) `,` qualified(type($TensorDesc)) `,` type($mask)}]; |
| |
| let hasVerifier = 1; |
| } |
| |
| def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset", |
| [AllTypesMatch<["TensorDesc", "result"]>]> { |
| let summary = "It updates the offsets for the given tensor descriptor"; |
| |
| let description = [{It behaves similar to `update_nd_offset` in terms that |
| it updates offset of a TensorDesc, and the offsets are relative offset to |
| the current position in the number of elements. However, `update_nd_offset` |
| is to update the start point of a 2D block, so its offset contains two |
| elements representing the shift in each dimension. `update_offset` is to |
| update the offset per work-item, so its offsets contains values representing |
| shifts for each work-item. |
| |
| Example: |
| ``` |
| %2 = xegpu.update_offset %1, [32, 32, 32, 32] |
| : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>> |
| ``` |
| }]; |
| |
| let arguments = (ins XeGPU_TensorDesc: $TensorDesc, |
| Variadic<Index>: $offsets, |
| DenseI64ArrayAttr: $const_offsets); |
| let results = (outs XeGPU_TensorDesc: $result); |
| |
| let extraClassDeclaration = extraBaseClassDeclaration # [{ |
| /// Returns the type of the TensorDesc operand. |
| xegpu::TensorDescType getTensorDescType() { |
| return getTensorDesc().getType(); |
| } |
| |
| /// Returns the offsets as OpFoldResults, merging `const_offsets` with the |
| /// dynamic `offsets` operands. |
| SmallVector<OpFoldResult> getMixedOffsets() { |
| Builder b(getContext()); |
| return getMixedValues(getConstOffsets(), getOffsets(), b); |
| } |
| |
| /// Returns the number of entries in the mixed offset list. |
| size_t getNumOffsets() { |
| return getMixedOffsets().size(); |
| } |
| |
| /// Returns the `idx`-th mixed offset; asserts that `idx` is in range. |
| /// The mixed list is built once and reused for the bound check. |
| OpFoldResult getOffset(unsigned idx) { |
| SmallVector<OpFoldResult> mixedOffsets = getMixedOffsets(); |
| assert(idx < mixedOffsets.size() && "Invalid out of bound access."); |
| return mixedOffsets[idx]; |
| } |
| }]; |
| |
| let assemblyFormat = [{ |
| $TensorDesc `,` |
| custom<DynamicIndexList>($offsets, $const_offsets) |
| attr-dict `:` qualified(type($TensorDesc)) |
| }]; |
| } |
| |
| // Matrix multiply(-accumulate) op: `lhs` and `rhs` are mandatory, `acc` is an |
| // optional accumulator operand. |
| def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]> { |
| let summary = "It performs mma computation"; |
| |
| let description = [{DPAS performs matrix multiplication on matrix A of `mxk` |
| size, B of `kxn` size, and accumulate on matrix C of `mxn` to the same size |
| matrix , `m=8`, `n=16` and `k=8 * 32/bit_width_of_elem_type`. So for fp16 |
| data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`, |
| and `C/D: vector<8x16xf32>`. Besides the matrix size requirements, DPAS |
| also requires A and B to be loaded with the required data layout. Specially, |
| VNNI layout is required for B operand. It is achieved via setting `vnni_axis = 0` |
| of the corresponding `load_nd` operator. To keep both operands as 3D vector, |
| operand A is loaded via setting `vnni_axis = 1` without impacting the |
| physical layouts change in register. Due to the VNNI transformation, A and B operands |
| are represented as 3D vector, with the last dimension representing the VNNI factor, |
| which is computed as `32/bit_width_of_elem_type`. Therefore, `A: vector<8x16xf16>` |
| is represented as `A: vector<8x8x2xf16>`, and `B: vector<16x16xf16>` is |
| represented as `B: vector<8x16x2xf16>`. |
| |
| Note: on PVC, the hardware can perform load with VNNI transformation when data |
| element type is 16-bit or lower precision, taking 2 or 4 elements from |
| the first dimension and inserted into the newly added innermost dimension. |
| }]; |
| |
| let arguments = (ins |
| XeGPU_DpasOpType : $lhs, |
| XeGPU_DpasOpType : $rhs, |
| Optional<XeGPU_Vector2DType>: $acc); |
| let results = (outs XeGPU_Vector2DType: $result); |
| |
| let assemblyFormat = [{ |
| $lhs `,` $rhs (`,` $acc^)? attr-dict `:` type($lhs)`,` type($rhs) (`,` type($acc)^)? `->` type($result) |
| }]; |
| |
| let hasVerifier = 1; |
| |
| let extraClassDeclaration = [{ |
| /// Vector type of the `lhs` operand. |
| VectorType getLhsType() { return getLhs().getType(); } |
| |
| /// Vector type of the `rhs` operand. |
| VectorType getRhsType() { return getRhs().getType(); } |
| |
| /// Vector type of the optional `acc` operand; a null VectorType when the |
| /// accumulator is absent. |
| VectorType getAccType() { |
| auto acc = getAcc(); |
| if (!acc) |
| return {}; |
| return acc.getType(); |
| } |
| |
| /// Vector type of the result. |
| VectorType getResultType() { return getResult().getType(); } |
| }]; |
| } |
| |
| // NOTE: this op reads and updates the memory described by `tensorDesc`, so it |
| // must not be `Pure`: a pure op is assumed to have no memory effects and may |
| // be erased when its result is unused, or merged with an identical op. The |
| // memory effects are declared explicitly instead. |
| def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [ |
| MemoryEffects<[MemRead, MemWrite]>, |
| AllElementTypesMatch<["tensorDesc", "value", "result"]>, |
| AllShapesMatch<["tensorDesc", "mask", "value", "result"]>]> { |
| let summary = "Atomic read-modify-write operation on the TensorDesc."; |
| |
| let description = [{ |
| The `xegpu.atomic_rmw` operation provides a way to perform a read-modify-write |
| operation on the region described by the `TensorDesc` free from data races. The |
| `kind` enumeration specifies the modification to be performed. The `mask` operand |
| has the same shape as the `TensorDesc`, and is used to enable or disable specific |
| data points of the `TensorDesc`. The `value` operand represents the new value to |
| be applied during the modification. |
| }]; |
| |
| let arguments = (ins |
| AtomicRMWKindAttr:$kind, |
| XeGPU_TensorDesc:$tensorDesc, |
| XeGPU_MaskType:$mask, |
| XeGPU_ValueType:$value); |
| |
| let results = (outs XeGPU_ValueType:$result); |
| |
| let assemblyFormat = [{ |
| $kind $tensorDesc `,` $mask `,` $value attr-dict `:` |
| type($tensorDesc) `,` type($mask) `,` type($value) `->` type($result) |
| }]; |
| } |
| |
| def XeGPU_AllocNbarrierOp: XeGPU_Op<"alloc_nbarrier", []> { |
| let summary = "It allocates a set of named barriers."; |
| let description = [{AllocNbarrier is to create a set of named barriers as |
| specified by `nbarrier_num`. Named barriers are workgroup level resources, |
| and are shared by all threads in the workgroup. For example, there are |
| up to 32 barriers (range 0-31) for each XeCore on PVC. A typical use case |
| is that a workgroup is partitioned into N subgroups of threads (N <= 32), |
| and each subgroup coordinates its work with a separate barrier, with ids |
| ranging from 0 to N-1 respectively.}]; |
| // Number of named barriers to allocate. |
| let arguments = (ins I64Attr: $nbarrier_num); |
| let assemblyFormat = "$nbarrier_num attr-dict"; |
| } |
| |
| // Takes two i8 operands (barrier id, participant thread count) and yields an |
| // XeGPU_Nbarrier value. |
| def XeGPU_InitNbarrierOp: XeGPU_Op<"init_nbarrier", []> { |
| let summary = "It assigns a named barrier to the current thread."; |
| let description = [{InitNbarrierOp assigns the named barrier with the specified |
| barrier ID (0~31) to the current thread. Multiple threads may bind to the |
| same named barrier, and the `participant_thread_num` specifies the total |
| number of threads associated with the nbarrier. It returns an object of |
| NbarrierType representing the barrier}]; |
| |
| let arguments = (ins |
| I8: $nbarrier_id, |
| I8: $participant_thread_num |
| ); |
| let results = (outs XeGPU_Nbarrier: $result); |
| |
| let assemblyFormat = [{ |
| $nbarrier_id `,` $participant_thread_num attr-dict `:` |
| type($nbarrier_id) `,` type($participant_thread_num) `->` qualified(type($result)) |
| }]; |
| } |
| |
| def XeGPU_NbarrierArriveOp: XeGPU_Op<"nbarrier_arrive", []> { |
| let summary = "It signals the arrival at the named barrier."; |
| let description = [{NbarrierArriveOp signals the hardware (or other threads) |
| that the current thread has produced its data for the consumer threads. When |
| the hardware has been signalled by `participant_thread_num` threads for the |
| named barrier, it will notify the threads waiting for the named barrier to |
| continue their work.}]; |
| |
| // The named barrier being signalled. |
| let arguments = (ins XeGPU_Nbarrier: $nbarrier); |
| let assemblyFormat = [{ $nbarrier attr-dict `:` qualified(type($nbarrier))}]; |
| } |
| |
| // Single-operand op taking the XeGPU_Nbarrier value to wait on; no results. |
| def XeGPU_NbarrierWaitOp: XeGPU_Op<"nbarrier_wait", []> { |
| let summary = "It waits for a named barrier."; |
| let description = [{NbarrierWaitOp signals the hardware which named barrier |
| the current thread is waiting for, such that it can get notified when the |
| named barrier is completed.}]; |
| |
| let arguments = (ins |
| XeGPU_Nbarrier: $nbarrier |
| ); |
| |
| let assemblyFormat = [{ $nbarrier attr-dict `:` qualified(type($nbarrier)) }]; |
| } |
| |
| def XeGPU_FenceOp: XeGPU_Op<"fence", []> { |
| let summary = "It synchronizes memory accesses."; |
| let description = [{It synchronizes the memory access between |
| write and following read or write. |
| 1. `memory_kind` describes the memory kind. "global" means the global memory, |
| "slm" means the shared local memory. |
| 2. `fence_scope` describes the scope of fence. "Workgroup" means that the scope would be |
| within each workgroup. "GPU" means the scope would be across workgroups within the GPU. |
| }]; |
| // Both attributes are mandatory and are spelled out by name in the syntax. |
| let arguments = (ins XeGPU_MemoryScopeAttr: $memory_kind, |
| XeGPU_FenceScopeAttr: $fence_scope); |
| let assemblyFormat = [{`memory_kind` `=` `` $memory_kind `,` `fence_scope` `=` `` $fence_scope attr-dict}]; |
| let extraClassDeclaration = extraBaseClassDeclaration; |
| } |
| |
| #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD |