| //===-- GPUOps.td - GPU dialect operation definitions ------*- tablegen -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // Defines some operations of the GPU dialect. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef GPU_OPS |
| #define GPU_OPS |
| |
| include "mlir/Dialect/DLTI/DLTIBase.td" |
| include "mlir/Dialect/GPU/GPUBase.td" |
| include "mlir/Dialect/LLVMIR/LLVMOpBase.td" |
| include "mlir/IR/SymbolInterfaces.td" |
| include "mlir/Interfaces/DataLayoutInterfaces.td" |
| include "mlir/Interfaces/InferTypeOpInterface.td" |
| include "mlir/Interfaces/SideEffectInterfaces.td" |
| |
| //===----------------------------------------------------------------------===// |
| // GPU Dialect operations. |
| //===----------------------------------------------------------------------===// |
| |
// Base class for all operations in the GPU dialect. `mnemonic` is the op name
// without the "gpu." prefix; `traits` are forwarded to the generic Op class.
class GPU_Op<string mnemonic, list<OpTrait> traits = []> :
    Op<GPU_Dialect, mnemonic, traits>;
| |
// Base class for the index-querying ops (thread/block ids and sizes). Each
// derived op is side-effect free, takes a string `dimension` attribute
// ("x", "y" or "z" per the examples below), produces a single `index` result,
// and shares the common ::verifyIndexOp verifier.
class GPU_IndexOp<string mnemonic, list<OpTrait> traits = []> :
    GPU_Op<mnemonic, !listconcat(traits, [NoSideEffect])>,
    Arguments<(ins StrAttr:$dimension)>, Results<(outs Index)> {
  let verifier = [{ return ::verifyIndexOp(*this); }];
}
| |
def GPU_BlockDimOp : GPU_IndexOp<"block_dim"> {
  // Summary added for consistency with the other documented ops in this file.
  let summary = "Returns the block size along the given dimension";
  let description = [{
    Returns the number of threads in the thread block (aka the block size) along
    the x, y, or z `dimension`.

    Example:

    ```mlir
    %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index)
    ```
  }];
}
def GPU_BlockIdOp : GPU_IndexOp<"block_id"> {
  // Summary added for consistency with the other documented ops in this file.
  let summary = "Returns the block id along the given dimension";
  let description = [{
    Returns the block id, i.e. the index of the current block within the grid
    along the x, y, or z `dimension`.

    Example:

    ```mlir
    %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index)
    ```
  }];
}
def GPU_GridDimOp : GPU_IndexOp<"grid_dim"> {
  // Summary added for consistency with the other documented ops in this file.
  let summary = "Returns the grid size along the given dimension";
  let description = [{
    Returns the number of thread blocks in the grid along the x, y, or z
    `dimension`.

    Example:

    ```mlir
    %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)
    ```
  }];
}
def GPU_ThreadIdOp : GPU_IndexOp<"thread_id"> {
  // Summary added for consistency with the other documented ops in this file.
  let summary = "Returns the thread id along the given dimension";
  let description = [{
    Returns the thread id, i.e. the index of the current thread within the block
    along the x, y, or z `dimension`.

    Example:

    ```mlir
    %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
    ```
  }];
}
| |
def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [NoSideEffect]>,
    Arguments<(ins)>, Results<(outs Index:$result)> {
  // Summary added for consistency with the other documented ops in this file.
  let summary = "Returns the subgroup id";
  let description = [{
    Returns the subgroup id, i.e. the index of the current subgroup within the
    workgroup.

    Example:

    ```mlir
    %sgId = gpu.subgroup_id : index
    ```
  }];

  let assemblyFormat = "attr-dict `:` type($result)";
  let verifier = [{ return success(); }];
}
| |
def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [NoSideEffect]>,
    Arguments<(ins)>, Results<(outs Index:$result)> {
  // Summary added for consistency with the other documented ops in this file.
  let summary = "Returns the number of subgroups within a workgroup";
  let description = [{
    Returns the number of subgroups within a workgroup.

    Example:

    ```mlir
    %numSg = gpu.num_subgroups : index
    ```
  }];

  let assemblyFormat = "attr-dict `:` type($result)";
  let verifier = [{ return success(); }];
}
| |
def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [NoSideEffect]>,
    Arguments<(ins)>, Results<(outs Index:$result)> {
  // Summary added for consistency with the other documented ops in this file.
  let summary = "Returns the number of threads within a subgroup";
  let description = [{
    Returns the number of threads within a subgroup.

    Example:

    ```mlir
    %sgSz = gpu.subgroup_size : index
    ```
  }];

  let assemblyFormat = "attr-dict `:` type($result)";
  let verifier = [{ return success(); }];
}
| |
def GPU_GPUFuncOp : GPU_Op<"func", [HasParent<"GPUModuleOp">,
                                    AutomaticAllocationScope, FunctionLike,
                                    IsolatedFromAbove, Symbol]> {
  let summary = "Function executable on a GPU";

  let description = [{
    Defines a function that can be executed on a GPU. This supports memory
    attribution and its body has a particular execution model.

    GPU functions are either kernels (as indicated by the `kernel` attribute) or
    regular functions. The former can be launched from the host side, while the
    latter are device side only.

    The memory attribution defines SSA values that correspond to memory buffers
    allocated in the memory hierarchy of the GPU (see below).

    The operation has one attached region that corresponds to the body of the
    function. The region arguments consist of the function arguments without
    modification, followed by buffers defined in memory annotations. The body of
    a GPU function, when launched, is executed by multiple work items. There are
    no guarantees on the order in which work items execute, or on the connection
    between them. In particular, work items are not necessarily executed in
    lock-step. Synchronization ops such as "gpu.barrier" should be used to
    coordinate work items. Declarations of GPU functions, i.e. not having the
    body region, are not supported.

    Syntax:

    ```
    op ::= `gpu.func` symbol-ref-id `(` argument-list `)` (`->`
    function-result-list)?
           memory-attribution `kernel`? function-attributes? region

    memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)?
                           (`private` `(` ssa-id-and-type-list `)`)?
    ```

    Example:

    ```mlir
    gpu.func @foo(%arg0: index)
        workgroup(%workgroup: memref<32xf32, 3>)
        private(%private: memref<1xf32, 5>)
        kernel
        attributes {qux: "quux"} {
      gpu.return
    }
    ```

    The generic form illustrates the concept

    ```mlir
    "gpu.func"(%arg: index) {sym_name: "foo", kernel, qux: "quux"} ({
    ^bb0(%arg0: index, %workgroup: memref<32xf32, 3>,
         %private: memref<1xf32, 5>):
      "gpu.return"() : () -> ()
    }) : (index) -> ()
    ```

    Note the non-default memory spaces used in memref types in memory
    attribution.
  }];

  let regions = (region AnyRegion:$body);

  let skipDefaultBuilders = 1;

  let builders = [
    OpBuilder<(ins "StringRef":$name, "FunctionType":$type,
      CArg<"TypeRange", "{}">:$workgroupAttributions,
      CArg<"TypeRange", "{}">:$privateAttributions,
      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>
  ];

  let extraClassDeclaration = [{
    /// Returns `true` if the GPU function defined by this Op is a kernel, i.e.
    /// it is intended to be launched from host.
    bool isKernel() {
      return (*this)->getAttrOfType<UnitAttr>(
          GPUDialect::getKernelFuncAttrName()) != nullptr;
    }

    /// Returns the number of buffers located in the workgroup memory.
    unsigned getNumWorkgroupAttributions() {
      return (*this)->getAttrOfType<IntegerAttr>(
          getNumWorkgroupAttributionsAttrName()).getInt();
    }

    /// Returns a list of block arguments that correspond to buffers located in
    /// the workgroup memory.
    ArrayRef<BlockArgument> getWorkgroupAttributions() {
      // Workgroup attributions are placed immediately after the function
      // arguments in the entry block's argument list.
      auto begin =
          std::next(getBody().args_begin(), getType().getNumInputs());
      auto end = std::next(begin, getNumWorkgroupAttributions());
      return {begin, end};
    }

    /// Adds a new block argument that corresponds to buffers located in
    /// workgroup memory.
    BlockArgument addWorkgroupAttribution(Type type);

    /// Returns the number of buffers located in the private memory.
    unsigned getNumPrivateAttributions() {
      // Private attributions are whatever entry-block arguments remain after
      // the function arguments and the workgroup attributions.
      return getBody().getNumArguments() - getType().getNumInputs() -
          getNumWorkgroupAttributions();
    }

    /// Returns a list of block arguments that correspond to buffers located in
    /// the private memory.
    ArrayRef<BlockArgument> getPrivateAttributions() {
      // Buffers on the private memory always come after buffers on the workgroup
      // memory.
      auto begin =
          std::next(getBody().args_begin(),
                    getType().getNumInputs() + getNumWorkgroupAttributions());
      return {begin, getBody().args_end()};
    }

    /// Adds a new block argument that corresponds to buffers located in
    /// private memory.
    BlockArgument addPrivateAttribution(Type type);

    /// Returns the name of the attribute containing the number of buffers
    /// located in the workgroup memory.
    static StringRef getNumWorkgroupAttributionsAttrName() {
      return "workgroup_attributions";
    }

    // FunctionLike trait needs access to the functions below.
    friend class OpTrait::FunctionLike<GPUFuncOp>;

    /// Hooks for the input/output type enumeration in FunctionLike.
    unsigned getNumFuncArguments() { return getType().getNumInputs(); }
    unsigned getNumFuncResults() { return getType().getNumResults(); }

    /// Returns the keywords used in the custom syntax for this Op.
    static StringRef getWorkgroupKeyword() { return "workgroup"; }
    static StringRef getPrivateKeyword() { return "private"; }
    static StringRef getKernelKeyword() { return "kernel"; }

    /// Hook for FunctionLike verifier.
    LogicalResult verifyType();

    /// Verifies the body of the function.
    LogicalResult verifyBody();
  }];

  // NOTE(review): the commented-out verifier below references
  // "::verifFuncOpy", which looks like a typo for a verify function that may
  // not exist; verification is instead performed through the FunctionLike
  // hooks verifyType()/verifyBody() declared above. Confirm before re-enabling.
  // let verifier = [{ return ::verifFuncOpy(*this); }];
  let printer = [{ printGPUFuncOp(p, *this); }];
  let parser = [{ return parseGPUFuncOp(parser, result); }];
}
| |
def GPU_LaunchFuncOp : GPU_Op<"launch_func",
    [GPU_AsyncOpInterface, AttrSizedOperandSegments]>,
    Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
               SymbolRefAttr:$kernel,
               Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
               Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
               Optional<I32>:$dynamicSharedMemorySize,
               Variadic<AnyType>:$operands)>,
    Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
  let summary = "Launches a function as a GPU kernel";

  let description = [{
    Launch a kernel function on the specified grid of thread blocks.
    `gpu.launch` operations are lowered to `gpu.launch_func` operations by
    outlining the kernel body into a function in a dedicated module, which
    reflects the separate compilation process. The kernel function is required
    to have the `gpu.kernel` attribute. The module containing the kernel
    function is required to be a gpu.module. And finally, the module containing
    the kernel module (which thus cannot be the top-level module) is required
    to have the `gpu.container_module` attribute. The `gpu.launch_func`
    operation has a symbol attribute named `kernel` to identify the fully
    specified kernel function to launch (both the gpu.module and func).

    The `gpu.launch_func` supports async dependencies: the kernel does not start
    executing until the ops producing those async dependencies have completed.

    By default, the host implicitly blocks until kernel execution has
    completed. If the `async` keyword is present, the host does not block but
    instead a `!gpu.async.token` is returned. Other async GPU ops can take this
    token as dependency.

    The operation requires at least the grid and block sizes along the x,y,z
    dimensions as arguments. When a lower-dimensional kernel is required,
    unused sizes must be explicitly set to `1`.

    The remaining operands are optional. The first optional operand corresponds
    to the amount of dynamic shared memory a kernel's workgroup should be
    allocated; when this operand is not present, a zero size is assumed.

    The remaining operands if present are passed as arguments to the kernel
    function.

    Example:

    ```mlir
    module attributes {gpu.container_module} {

      // This module creates a separate compilation unit for the GPU compiler.
      gpu.module @kernels {
        func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32, 1>)
            attributes { nvvm.kernel = true } {

          // Operations that produce block/thread IDs and dimensions are
          // injected when outlining the `gpu.launch` body to a function called
          // by `gpu.launch_func`.
          %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
          %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index)
          %tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index)

          %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index)
          %bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index)
          %bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index)

          %bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index)
          %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index)
          %bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index)

          %gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index)
          %gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index)
          %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)

          "some_op"(%bIdX, %tIdX) : (index, index) -> ()
          %42 = load %arg1[%bIdX] : memref<?xf32, 1>
        }
      }

      %t0 = gpu.wait async
      gpu.launch_func
          async                           // (Optional) Don't block host, return token.
          [%t0]                           // (Optional) Execute only after %t0 has completed.
          @kernels::@kernel_1             // Kernel function.
          blocks in (%cst, %cst, %cst)    // Grid size.
          threads in (%cst, %cst, %cst)   // Block size.
          dynamic_shared_memory_size %s   // (Optional) Amount of dynamic shared
                                          // memory to allocate for a workgroup.
          args(%arg0 : f32,               // (Optional) Kernel arguments.
               %arg1 : memref<?xf32, 1>)
    }
    ```
  }];

  let skipDefaultBuilders = 1;

  let builders = [
    OpBuilder<(ins "GPUFuncOp":$kernelFunc, "KernelDim3":$gridSize,
      "KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize,
      "ValueRange":$kernelOperands)>
  ];

  let extraClassDeclaration = [{
    /// The number of operands passed to the kernel function.
    unsigned getNumKernelOperands();

    /// The name of the kernel's containing module.
    StringAttr getKernelModuleName();

    /// The name of the kernel.
    StringAttr getKernelName();

    /// The i-th operand passed to the kernel function.
    Value getKernelOperand(unsigned i);

    /// Get the SSA values passed as operands to specify the grid size.
    KernelDim3 getGridSizeOperandValues();

    /// Get the SSA values passed as operands to specify the block size.
    KernelDim3 getBlockSizeOperandValues();

    /// The number of launch configuration operands, placed at the leading
    /// positions of the operand list.
    static constexpr unsigned kNumConfigOperands = 6;

    // This needs to quietly verify if attributes with names defined below are
    // present since it is run before the verifier of this op.
    friend LogicalResult GPUDialect::verifyOperationAttribute(Operation *,
                                                              NamedAttribute);

    /// The name of the symbol reference attribute specifying the kernel to launch.
    static StringRef getKernelAttrName() { return "kernel"; }
  }];

  let verifier = [{ return ::verify(*this); }];
  let assemblyFormat = [{
      custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
      $kernel
      `blocks` `in` ` ` `(`$gridSizeX`,` $gridSizeY`,` $gridSizeZ`)`
      `threads` `in` ` ` `(`$blockSizeX`,` $blockSizeY`,` $blockSizeZ`)`
      (`dynamic_shared_memory_size` $dynamicSharedMemorySize^)?
      custom<LaunchFuncOperands>($operands, type($operands)) attr-dict
  }];
}
| |
def GPU_LaunchOp : GPU_Op<"launch">,
    Arguments<(ins Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
               Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
               Optional<I32>:$dynamicSharedMemorySize)>,
    Results<(outs)> {
  let summary = "GPU kernel launch operation";

  let description = [{
    Launch a kernel on the specified grid of thread blocks. The body of the
    kernel is defined by the single region that this operation contains. The
    operation takes six operands followed by an optional operand: the first
    three operands are grid sizes along the x,y,z dimensions and the following
    three are block sizes along the x,y,z dimensions. The last operand is
    optional and corresponds to the amount of dynamic shared memory a kernel's
    workgroup should be allocated; when this operand is not present, a zero size
    is assumed.

    When a lower-dimensional kernel is required, unused sizes must
    be explicitly set to `1`.

    The body region has _twelve_ arguments, grouped as follows:

    -   three arguments that contain block identifiers along x,y,z dimensions;
    -   three arguments that contain thread identifiers along x,y,z dimensions;
    -   operands of the `gpu.launch` operation as is (i.e. the operands for
        grid and block sizes).

    Syntax:

    ```
    operation ::= `gpu.launch` `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
                               `threads` `(` ssa-id-list `)` `in` ssa-reassignment
                               (`dynamic_shared_memory_size` ssa-use)?
                               region attr-dict?
    ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
    ```

    Example:

    ```mlir
    gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2)
               threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5) {
      // Block and thread identifiers, as well as block/grid sizes are
      // immediately usable inside body region.
      "some_op"(%bx, %tx) : (index, index) -> ()
      // Assuming %val1 is defined outside the gpu.launch region.
      %42 = load %val1[%bx] : memref<?xf32, 1>
    }

    // Generic syntax explains how the pretty syntax maps to the IR structure.
    "gpu.launch"(%cst, %cst, %c1,  // Grid sizes.
                 %cst, %c1, %c1)   // Block sizes.

        {/*attributes*/}
        // All sizes and identifiers have "index" size.
        : (index, index, index, index, index, index) -> () {
      // The operation passes block and thread identifiers, followed by grid and
      // block sizes.
      ^bb0(%bx : index, %by : index, %bz : index,
           %tx : index, %ty : index, %tz : index,
           %num_bx : index, %num_by : index, %num_bz : index,
           %num_tx : index, %num_ty : index, %num_tz : index)
        "some_op"(%bx, %tx) : (index, index) -> ()
        %3 = "memref.load"(%val1, %bx) : (memref<?xf32, 1>, index) -> f32
    }
    ```

    Rationale: using operation/block arguments gives analyses a clear way of
    understanding that a value has additional semantics (e.g., we will need to
    know what value corresponds to threadIdx.x for coalescing). We can recover
    these properties by analyzing the operations producing values, but it is
    easier just to have that information by construction.
  }];

  let regions = (region AnyRegion:$body);

  let skipDefaultBuilders = 1;

  let builders = [
    OpBuilder<(ins "Value":$gridSizeX, "Value":$gridSizeY,
      "Value":$gridSizeZ, "Value":$blockSizeX, "Value":$blockSizeY,
      "Value":$blockSizeZ,
      CArg<"Value", "nullptr">:$dynamic_shared_memory_size)>
  ];

  let extraClassDeclaration = [{
    /// Get the SSA values corresponding to kernel block identifiers.
    KernelDim3 getBlockIds();
    /// Get the SSA values corresponding to kernel thread identifiers.
    KernelDim3 getThreadIds();
    /// Get the SSA values corresponding to kernel grid size.
    KernelDim3 getGridSize();
    /// Get the SSA values corresponding to kernel block size.
    KernelDim3 getBlockSize();

    /// Get the SSA values passed as operands to specify the grid size.
    KernelDim3 getGridSizeOperandValues();
    /// Get the SSA values passed as operands to specify the block size.
    KernelDim3 getBlockSizeOperandValues();

    static StringRef getBlocksKeyword() { return "blocks"; }
    static StringRef getThreadsKeyword() { return "threads"; }
    static StringRef getDynamicSharedMemorySizeKeyword() {
      return "dynamic_shared_memory_size";
    }

    /// The number of launch configuration operands, placed at the leading
    /// positions of the operand list.
    static constexpr unsigned kNumConfigOperands = 6;

    /// The number of region attributes containing the launch configuration,
    /// placed in the leading positions of the argument list.
    static constexpr unsigned kNumConfigRegionAttributes = 12;
  }];

  let parser = [{ return parseLaunchOp(parser, result); }];
  let printer = [{ printLaunchOp(p, *this); }];
  let verifier = [{ return ::verify(*this); }];
  let hasCanonicalizer = 1;
}
| |
def GPU_ReturnOp : GPU_Op<"return", [HasParent<"GPUFuncOp">, NoSideEffect,
                                     Terminator]>,
    Arguments<(ins Variadic<AnyType>:$operands)>, Results<(outs)> {
  let summary = "Terminator for GPU functions.";
  let description = [{
    A terminator operation for regions that appear in the body of `gpu.func`
    functions. The operands to the `gpu.return` are the result values returned
    by an invocation of the `gpu.func`.
  }];

  // Zero-operand builder; the body is intentionally empty.
  let builders = [OpBuilder<(ins), [{ // empty}]>];

  let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?";
  let verifier = [{ return ::verify(*this); }];
}
| |
def GPU_TerminatorOp : GPU_Op<"terminator", [HasParent<"LaunchOp">,
                                             NoSideEffect, Terminator]>,
    Arguments<(ins)>, Results<(outs)> {
  let summary = "Terminator for GPU launch regions.";
  let description = [{
    A terminator operation for regions that appear in the body of `gpu.launch`
    operation. These regions are not expected to return any value so the
    terminator takes no operands.
  }];

  // No operands or results, so the syntax is just the attribute dictionary.
  let assemblyFormat = "attr-dict";
}
| |
def GPU_YieldOp : GPU_Op<"yield", [NoSideEffect, Terminator]>,
    Arguments<(ins Variadic<AnyType>:$values)> {
  let summary = "GPU yield operation";
  let description = [{
    `gpu.yield` is a special terminator operation for blocks inside regions
    in gpu ops. It returns values to the immediately enclosing gpu op.

    Example:

    ```mlir
    gpu.yield %f0, %f1 : f32, f32
    ```
  }];
}
| |
// String enum cases for the built-in reduction kinds accepted by
// gpu.all_reduce (collected in GPU_AllReduceOperationAttr below).
def GPU_AllReduceOpAdd : StrEnumAttrCase<"ADD", -1, "add">;
def GPU_AllReduceOpAnd : StrEnumAttrCase<"AND", -1, "and">;
def GPU_AllReduceOpMax : StrEnumAttrCase<"MAX", -1, "max">;
def GPU_AllReduceOpMin : StrEnumAttrCase<"MIN", -1, "min">;
def GPU_AllReduceOpMul : StrEnumAttrCase<"MUL", -1, "mul">;
def GPU_AllReduceOpOr  : StrEnumAttrCase<"OR",  -1, "or">;
def GPU_AllReduceOpXor : StrEnumAttrCase<"XOR", -1, "xor">;
| |
def GPU_AllReduceOperationAttr : StrEnumAttr<"AllReduceOperationAttr",
    "built-in reduction operations supported by gpu.all_reduce.",
    [
      GPU_AllReduceOpAdd,
      GPU_AllReduceOpAnd,
      GPU_AllReduceOpMax,
      GPU_AllReduceOpMin,
      GPU_AllReduceOpMul,
      GPU_AllReduceOpOr,
      GPU_AllReduceOpXor
    ]> {
  let cppNamespace = "::mlir::gpu";
}
| |
def GPU_AllReduceOp : GPU_Op<"all_reduce",
    [SameOperandsAndResultType, IsolatedFromAbove]>,
    Arguments<(ins AnyType:$value,
               OptionalAttr<GPU_AllReduceOperationAttr>:$op)>,
    Results<(outs AnyType)> {
  let summary = "Reduce values among workgroup.";
  let description = [{
    The `all_reduce` op reduces the value of every work item across a local
    workgroup. The result is equal for all work items of a workgroup.

    For example, both

    ```mlir
    %1 = "gpu.all_reduce"(%0) ({}) { op = "add" } : (f32) -> (f32)
    %2 = "gpu.all_reduce"(%0) ({
    ^bb(%lhs : f32, %rhs : f32):
      %sum = arith.addf %lhs, %rhs : f32
      "gpu.yield"(%sum) : (f32) -> ()
    }) : (f32) -> (f32)
    ```

    compute the sum of each work item's %0 value. The first version specifies
    the accumulation as operation, whereas the second version specifies the
    accumulation as code region. The accumulation operation must be one of:
    `add`, `and`, `max`, `min`, `mul`, `or`, `xor`.

    Either none or all work items of a workgroup need to execute this op
    in convergence.
  }];
  // Optional accumulation region; when it is empty, the `op` attribute selects
  // one of the built-in reductions (see the description above).
  let regions = (region AnyRegion:$body);
  let verifier = [{ return ::verifyAllReduce(*this); }];
}
| |
// String enum cases for the indexing modes accepted by gpu.shuffle (collected
// in GPU_ShuffleModeAttr below).
def GPU_ShuffleOpXor  : StrEnumAttrCase<"XOR",  -1, "xor">;
def GPU_ShuffleOpDown : StrEnumAttrCase<"DOWN", -1, "down">;
def GPU_ShuffleOpUp   : StrEnumAttrCase<"UP",   -1, "up">;
def GPU_ShuffleOpIdx  : StrEnumAttrCase<"IDX",  -1, "idx">;
| |
def GPU_ShuffleModeAttr : StrEnumAttr<"ShuffleModeAttr",
    "Indexing modes supported by gpu.shuffle.",
    [
      GPU_ShuffleOpXor, GPU_ShuffleOpUp, GPU_ShuffleOpDown, GPU_ShuffleOpIdx
    ]> {
  let cppNamespace = "::mlir::gpu";
  let storageType = "mlir::StringAttr";
  let returnType = "::mlir::gpu::ShuffleModeAttr";
  let convertFromStorage =
      "*symbolizeEnum<::mlir::gpu::ShuffleModeAttr>($_self.getValue())";
  let constBuilderCall = "$_builder.getStringAttr(stringifyEnum($0))";
}
| |
| |
def GPU_ShuffleOp : GPU_Op<"shuffle", [NoSideEffect]>,
    Arguments<(ins AnyType:$value, I32:$offset, I32:$width,
               GPU_ShuffleModeAttr:$mode)>,
    Results<(outs AnyType:$result, I1:$valid)> {
  let summary = "Shuffles values within a subgroup.";
  let description = [{
    The "shuffle" op moves values to a different invocation within the same
    subgroup.

    Example:

    ```mlir
    %1, %2 = gpu.shuffle %0, %offset, %width xor : f32
    ```

    For lane k returns the value from lane `k ^ offset` and `true` if that lane
    is smaller than %width. Otherwise it returns an unspecified value and
    `false`. A lane is the index of an invocation relative to its subgroup.

    The width specifies the number of invocations that participate in the
    shuffle. The width needs to be the same for all invocations that participate
    in the shuffle. Exactly the first `width` invocations of a subgroup need to
    execute this op in convergence.
  }];
  // Custom syntax is implemented by printShuffleOp/parseShuffleOp in the
  // dialect's C++ implementation.
  let verifier = [{ return ::verifyShuffleOp(*this); }];
  let printer = [{ printShuffleOp(p, *this); }];
  let parser = [{ return parseShuffleOp(parser, result); }];
}
| |
def GPU_BarrierOp : GPU_Op<"barrier"> {
  let summary = "Synchronizes all work items of a workgroup.";
  let description = [{
    The "barrier" op synchronizes all work items of a workgroup. It is used
    to coordinate communication between the work items of the workgroup.

    ```mlir
    gpu.barrier
    ```

    waits until all work items in the workgroup have reached this point
    and all memory accesses made by these work items prior to the op are
    visible to all work items in the workgroup. Data hazards between work items
    accessing the same memory can be avoided by synchronizing work items
    in-between these accesses.

    Either none or all work items of a workgroup need to execute this op
    in convergence.
  }];
  // The op has no operands or results; it is purely a synchronization point.
  let assemblyFormat = "attr-dict";
}
| |
def GPU_GPUModuleOp : GPU_Op<"module", [
    DataLayoutOpInterface, HasDefaultDLTIDataLayout, IsolatedFromAbove,
    SymbolTable, Symbol,
    SingleBlockImplicitTerminator<"ModuleEndOp">
]> {
  let summary = "A top level compilation unit containing code to be run on a GPU.";
  let description = [{
    GPU module contains code that is intended to be run on a GPU. A host device
    can launch this code through a gpu.launch_func that creates a fully
    qualified symbol through the gpu.module's symbol and a gpu.func symbol
    contained in the gpu.module.

    The module's top-level scope is modeled by a single region with a single
    block. GPU modules are required to have a name that is used for symbol
    resolution by the gpu.launch_func operation.

    Using an op with a region to define a GPU module enables "embedding" GPU
    modules with SIMT execution models in other dialects in a clean manner and
    allows filtering of code regions to execute passes on only code intended to
    or not intended to be run on the separate device.

    ```
    gpu.module @symbol_name {
      gpu.func {}
      ...
      gpu.module_end
    }
    ```
  }];
  let builders = [OpBuilder<(ins "StringRef":$name)>];
  let parser = [{ return ::parseGPUModuleOp(parser, result); }];
  let printer = [{ return ::print(p, *this); }];
  let regions = (region SizedRegion<1>:$body);

  // We need to ensure the block inside the region is properly terminated;
  // the auto-generated builders do not guarantee that.
  let skipDefaultBuilders = 1;
}
| |
def GPU_ModuleEndOp : GPU_Op<"module_end", [
  Terminator, HasParent<"GPUModuleOp">
]> {
  let summary = "A pseudo op that marks the end of a gpu.module.";
  let description = [{
    This op terminates the only block inside the only region of a `gpu.module`.
  }];

  // No operands or results; the syntax is just the attribute dictionary.
  let assemblyFormat = "attr-dict";
}
| |
def GPU_HostRegisterOp : GPU_Op<"host_register">,
    Arguments<(ins AnyUnrankedMemRef:$value)> {
  let summary = "Registers a memref for access from device.";
  let description = [{
    This op maps the provided host buffer into the device address space.

    This operation may not be supported in every environment, there is not yet a
    way to check at runtime whether this feature is supported.

    Writes from the host are guaranteed to be visible to device kernels that are
    launched afterwards. Writes from the device are guaranteed to be visible on
    the host after synchronizing with the device kernel completion.
  }];

  let assemblyFormat = "$value attr-dict `:` type($value)";
  // Nothing op-specific to verify; the operand's unranked-memref constraint is
  // enforced by the ODS-declared argument type.
  let verifier = [{ return success(); }];
}
| |
def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> {
  let summary = "Wait for async gpu ops to complete.";
  let description = [{
    This op synchronizes the host or the device with a list of dependent ops.

    If the op contains the `async` keyword, it returns a new async token which
    is synchronized with the op arguments. This new token is merely a shortcut
    to the argument list, and one could replace the uses of the result with the
    arguments for the same effect. The async version of this op is primarily
    used to make each async token have a single use during lowering and
    thereby make forks in async execution explicit. Example usage:

    ```mlir
    %t0 = gpu.foo async : !gpu.async.token
    %t1 = gpu.bar async : !gpu.async.token
    %t2 = gpu.wait async [%t0, %t1]
    // gpu.baz doesn't run until gpu.foo and gpu.bar have both completed, just
    // as if the async dependencies were [%t0, %t1].
    %t3 = gpu.baz async [%t2]
    ```

    If the op does not contain the `async` keyword, it does not return a new
    async token but blocks until all ops producing the async dependency tokens
    finished execution. All dependent memory operations are visible to the host
    once this op completes. Example usage:

    ```mlir
    %t0 = gpu.foo async : !gpu.async.token
    %t1 = gpu.bar async : !gpu.async.token
    // The gpu.wait op blocks until gpu.foo and gpu.bar have completed.
    gpu.wait [%t0, %t1]
    ```
  }];

  // The result token is only present in the `async` form (see description).
  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);

  // The custom AsyncDependencies directive prints/parses the optional `async`
  // keyword together with the dependency list.
  let assemblyFormat = [{
    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict
  }];
}
| |
def GPU_AllocOp : GPU_Op<"alloc", [
    GPU_AsyncOpInterface,
    AttrSizedOperandSegments
  ]> {

  let summary = "GPU memory allocation operation.";
  let description = [{
    The `gpu.alloc` operation allocates a region of memory on the GPU. It is
    similar to the `memref.alloc` op, but supports asynchronous GPU execution.

    The op does not execute before all async dependencies have finished
    executing.

    If the `async` keyword is present, the op is executed asynchronously (i.e.
    it does not block until the execution has finished on the device). In
    that case, it also returns a !gpu.async.token.

    Example:

    ```mlir
    %memref, %token = gpu.alloc async [%dep] (%width) : memref<64x?xf32, 1>
    ```
  }];

  // The memref result carries a MemAlloc effect to model the allocation.
  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                   Variadic<Index>:$dynamicSizes, Variadic<Index>:$symbolOperands);
  let results = (outs Res<AnyMemRef, "", [MemAlloc]>:$memref,
                 Optional<GPU_AsyncToken>:$asyncToken);

  let extraClassDeclaration = [{
    /// Returns the result memref type.
    MemRefType getType() { return memref().getType().cast<MemRefType>(); }
  }];

  let assemblyFormat = [{
    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) ` `
    `(` $dynamicSizes `)` (`` `[` $symbolOperands^ `]`)? attr-dict `:` type($memref)
  }];

  // Canonicalization patterns are registered in the dialect's C++
  // implementation.
  let hasCanonicalizer = 1;
}
| |
// gpu.dealloc: frees memory previously allocated with gpu.alloc; the
// async-capable counterpart of memref.dealloc.
def GPU_DeallocOp : GPU_Op<"dealloc", [GPU_AsyncOpInterface]> {

  let summary = "GPU memory deallocation operation";

  let description = [{
    The `gpu.dealloc` operation frees the region of memory referenced by a
    memref which was originally created by the `gpu.alloc` operation. It is
    similar to the `memref.dealloc` op, but supports asynchronous GPU execution.

    The op does not execute before all async dependencies have finished
    executing.

    If the `async` keyword is present, the op is executed asynchronously (i.e.
    it does not block until the execution has finished on the device). In
    that case, it returns a !gpu.async.token.

    Example:

    ```mlir
    %token = gpu.dealloc async [%dep] %memref : memref<8x64xf32, 1>
    ```
  }];

  // The memref operand carries a MemFree effect; the token result is only
  // present in the `async` form.
  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                   Arg<AnyMemRef, "", [MemFree]>:$memref);
  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);

  let assemblyFormat = [{
    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
    $memref attr-dict `:` type($memref)
  }];
}
| |
// gpu.memcpy: copies one memref into another, with optional asynchronous
// execution via GPU_AsyncOpInterface.
def GPU_MemcpyOp : GPU_Op<"memcpy", [GPU_AsyncOpInterface]> {

  let summary = "GPU memcpy operation";

  let description = [{
    The `gpu.memcpy` operation copies the content of one memref to another.

    The op does not execute before all async dependencies have finished
    executing.

    If the `async` keyword is present, the op is executed asynchronously (i.e.
    it does not block until the execution has finished on the device). In
    that case, it returns a !gpu.async.token.

    Example:

    ```mlir
    %token = gpu.memcpy async [%dep] %dst, %src : memref<?xf32, 1>, memref<?xf32>
    ```
  }];

  // $dst is written and $src is read; any additional compatibility
  // constraints between the two memrefs are checked by the C++ verifier.
  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                   Arg<AnyMemRef, "", [MemWrite]>:$dst,
                   Arg<AnyMemRef, "", [MemRead]>:$src);
  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);

  let assemblyFormat = [{
    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
    $dst`,` $src `:` type($dst)`,` type($src) attr-dict
  }];
  // Verification and folding are implemented in C++.
  let verifier = [{ return ::verify(*this); }];
  let hasFolder = 1;
}
| |
// gpu.memset: fills a memref with a scalar value, with optional asynchronous
// execution. AllElementTypesMatch guarantees the scalar's type equals the
// destination memref's element type.
def GPU_MemsetOp : GPU_Op<"memset",
    [GPU_AsyncOpInterface, AllElementTypesMatch<["dst", "value"]>]> {

  let summary = "GPU memset operation";

  let description = [{
    The `gpu.memset` operation sets the content of memref to a scalar value.

    The op does not execute before all async dependencies have finished
    executing.

    If the `async` keyword is present, the op is executed asynchronously (i.e.
    it does not block until the execution has finished on the device). In
    that case, it returns a !gpu.async.token.

    Example:

    ```mlir
    %token = gpu.memset async [%dep] %dst, %value : memref<?xf32, 1>, f32
    ```
  }];

  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                   Arg<AnyMemRef, "", [MemWrite]>:$dst,
                   Arg<AnyType, "">:$value);
  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);

  let assemblyFormat = [{
    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
    $dst`,` $value `:` type($dst)`,` type($value) attr-dict
  }];
  // MemsetOp is fully verified by traits.
  let verifier = [{ return success(); }];
  let hasFolder = 1;
}
| |
// gpu.subgroup_mma_load_matrix: subgroup-collective load of a matrix tile
// from a memref into a !gpu.mma_matrix value.
def GPU_SubgroupMmaLoadMatrixOp : GPU_Op<"subgroup_mma_load_matrix",
    [MemoryEffects<[MemRead]>]>{

  let summary = "GPU warp synchronous matrix load";

  let description = [{
    The `gpu.subgroup_mma_load_matrix` operation loads a matrix collectively
    using all the threads in a subgroup.

    This operation takes a memref as its first operand: it is the source matrix
    from which data is to be loaded. The op returns a `!gpu.mma_matrix`. The
    source memref can be in global memory or shared memory. The load address is
    determined using `indices`. The matrix being loaded into is the result. The
    `leadDimension` attribute specifies the leading dimension size of the source
    matrix which eventually allows the lowering to determine the size of each
    row.

    This op is often meant to be used along with `gpu.subgroup_mma_store_matrix` and
    `gpu.subgroup_mma_compute`.

    Example:

    ```mlir
    %0 = gpu.subgroup_mma_load_matrix %src[%i,%j] {leadDimension = 32 : index}
          : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
    ```
  }];

  // The source memref carries a MemRead effect; leadDimension is an index
  // attribute (printed as `32 : index`, not `i32`).
  let arguments = (ins Arg<MemRefOf<[F16, F32]>, "", [MemRead]>:$srcMemref,
                  Variadic<Index>:$indices,
                  IndexAttr:$leadDimension);

  let results = (outs GPU_MMAMatrix:$res);

  let assemblyFormat = [{
    $srcMemref`[`$indices`]` attr-dict `:` type($srcMemref) `->` type($res)
  }];

  // Further constraints (e.g. between memref and mma_matrix types) are
  // checked by the C++ verifier.
  let verifier = [{ return ::verify(*this); }];
}
| |
// gpu.subgroup_mma_store_matrix: subgroup-collective store of a
// !gpu.mma_matrix value into a memref.
def GPU_SubgroupMmaStoreMatrixOp : GPU_Op<"subgroup_mma_store_matrix",
    [MemoryEffects<[MemWrite]>]>{

  let summary = "GPU warp synchronous matrix store";

  let description = [{
    The `gpu.subgroup_mma_store_matrix` operation stores a matrix collectively
    using all the threads in a subgroup.

    This operation takes a `!gpu.mma_matrix` and a memref as operands.
    `!gpu.mma_matrix` is the source value containing the data to be stored into the
    destination memref which can be in global or shared memory. The store address
    is determined using the indices provided. The `leadDimension` attribute
    specifies the leading dimension of the destination matrix.

    This op is often meant to be used along with `gpu.subgroup_mma_load_matrix` and
    `gpu.subgroup_mma_compute`.

    Example:

    ```mlir
    gpu.subgroup_mma_store_matrix %D, %sg[%i,%j] {leadDimension = 32 : index}
          : !gpu.mma_matrix<16x16xf16, "COp">, memref<32x32xf16, 3>
    ```
  }];

  // The destination memref carries a MemWrite effect; leadDimension is an
  // index attribute (printed as `32 : index`, not `i32`).
  let arguments = (ins Arg<MMAMatrixOf<[F16, F32]>>:$src,
                  Arg<MemRefOf<[F16, F32]>, "", [MemWrite]>:$dstMemref,
                  Variadic<Index>:$indices,
                  IndexAttr:$leadDimension);

  let assemblyFormat = [{
    $src`,` $dstMemref`[`$indices`]` attr-dict `:` type($src)`,` type($dstMemref)
  }];

  // Further constraints (e.g. between mma_matrix and memref types) are
  // checked by the C++ verifier.
  let verifier = [{ return ::verify(*this); }];
}
| |
// gpu.subgroup_mma_compute: subgroup-collective C += A * B on
// !gpu.mma_matrix values. AllTypesMatch ties the accumulator operand type to
// the result type.
def GPU_SubgroupMmaComputeOp : GPU_Op<"subgroup_mma_compute",
   [NoSideEffect, AllTypesMatch<["opC", "res"]>]>{

  let summary = "GPU warp synchronous matrix multiply accumulate";

  let description = [{
    The `gpu.subgroup_mma_compute` operation performs a matrix-multiply accumulate (mma)
    operation using all the threads in a subgroup.

    This operation takes three `!gpu.mma_matrix`s as arguments: these hold `A`,
    `B` and `C` operands for the mma operation. The operation performed is represented
    as `C += A * B`. The op returns a `!gpu.mma_matrix` which contains the result of
    the operation held by all threads in a subgroup.

    This op is meant to be used along with `gpu.subgroup_mma_store_matrix` and
    `gpu.subgroup_mma_load_matrix` ops.

    Example:

    ```mlir
    %D = gpu.subgroup_mma_compute %A, %B, %C :
      !gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp">
      -> !gpu.mma_matrix<16x16xf16, "COp">
    ```
  }];

  let arguments = (ins Arg<MMAMatrixOf<[F16, F32]>>:$opA,
                  Arg<MMAMatrixOf<[F16, F32]>>:$opB,
                  Arg<MMAMatrixOf<[F16, F32]>>:$opC);

  let results = (outs GPU_MMAMatrix:$res);

  // Only the A and B operand types are spelled out; the C operand's type is
  // inferred from the result type via AllTypesMatch.
  let assemblyFormat = [{
    $opA`,` $opB`,` $opC attr-dict `:` type($opA)`,` type($opB) `->` type($res)
  }];

  // Operand-kind ("AOp"/"BOp"/"COp") and shape constraints are checked by
  // the C++ verifier.
  let verifier = [{ return ::verify(*this); }];
}
| |
// gpu.subgroup_mma_constant_matrix: materializes a !gpu.mma_matrix whose
// elements all hold one scalar value. TypesMatchWith ties the scalar operand
// type to the result matrix's element type.
def GPU_SubgroupMmaConstantMatrixOp : GPU_Op<"subgroup_mma_constant_matrix",
    [NoSideEffect,
     TypesMatchWith<"value type matches element type of mma_matrix",
                    "res", "value",
                    "$_self.cast<gpu::MMAMatrixType>().getElementType()">]>{

  let summary = "GPU warp synchronous constant matrix";

  let description = [{
    The `gpu.subgroup_mma_constant_matrix` creates a `!gpu.mma_matrix` with
    constant elements.

    The operation takes a scalar input and returns a `!gpu.mma_matrix` where
    each element of the result is equal to the operand constant. The
    destination mma_matrix type must have an element type equal to the
    constant type. Since the layout of `!gpu.mma_matrix` is opaque, this op
    only supports setting all the elements to the same value.

    This op is meant to be used along with `gpu.subgroup_mma_compute`.

    Example:

    ```mlir
    %0 = gpu.subgroup_mma_constant_matrix %a :
      !gpu.mma_matrix<16x16xf16, "AOp">
    %1 = gpu.subgroup_mma_constant_matrix %b :
      !gpu.mma_matrix<16x16xf32, "COp">
    ```
  }];

  let arguments = (ins AnyTypeOf<[F16, F32]>:$value);

  let results = (outs GPU_MMAMatrix:$res);

  // Convenience accessor for the result's mma_matrix type.
  let extraClassDeclaration = [{
    gpu::MMAMatrixType getType() {
      return res().getType().cast<gpu::MMAMatrixType>();
    }
  }];

  // The operand type is not printed; it is inferred from the result's
  // element type via the TypesMatchWith trait.
  let assemblyFormat = [{
    $value attr-dict `:` type($res)
  }];
}
| |
// String-enum cases naming the floating-point elementwise operations
// supported by gpu.subgroup_mma_elementwise (collected in MMAElementWiseAttr
// below).
def GPU_ELEMENTWISE_OP_ADD : StrEnumAttrCase<"ADDF">;
def GPU_ELEMENTWISE_OP_MUL : StrEnumAttrCase<"MULF">;
def GPU_ELEMENTWISE_OP_MAXF : StrEnumAttrCase<"MAXF">;
def GPU_ELEMENTWISE_OP_MINF : StrEnumAttrCase<"MINF">;
def GPU_ELEMENTWISE_OP_DIVF : StrEnumAttrCase<"DIVF">;
| |
// Attribute selecting which elementwise operation gpu.subgroup_mma_elementwise
// applies. Stored as a plain StringAttr and converted to the C++
// MMAElementwiseOp enum on access.
def MMAElementWiseAttr : StrEnumAttr<"MMAElementwiseOp",
  "elementwise operation to apply to mma matrix",
  [GPU_ELEMENTWISE_OP_ADD, GPU_ELEMENTWISE_OP_MUL,
   GPU_ELEMENTWISE_OP_MAXF, GPU_ELEMENTWISE_OP_MINF, GPU_ELEMENTWISE_OP_DIVF]> {
  let cppNamespace = "::mlir::gpu";
  let storageType = "::mlir::StringAttr";
  let returnType = "::mlir::gpu::MMAElementwiseOp";
  // NOTE(review): the unconditional dereference assumes the stored string is
  // always a valid enum case — presumably guaranteed by attribute
  // verification; confirm.
  let convertFromStorage = "*symbolizeMMAElementwiseOp($_self.getValue())";
  let constBuilderCall = "$_builder.getStringAttr(stringifyEnum($0))";
}
| |
// gpu.subgroup_mma_elementwise: applies a pointwise operation (selected by
// the `operation` attribute) to !gpu.mma_matrix operands.
def GPU_SubgroupMmaElementwiseOp : GPU_Op<"subgroup_mma_elementwise",
    [NoSideEffect,
     AllTypesMatch<["args"]>]>{

  let summary = "GPU warp elementwise operation on a matrix";

  let description = [{
    The `gpu.subgroup_mma_elementwise` takes `!gpu.mma_matrix` inputs and
    computes a new `!gpu.mma_matrix` by applying an elementwise operation to
    each element.

    Since the operation is elementwise and the matrix type must match, the
    matrix elements are processed independently of the matrix layout.

    This op is meant to be used along with `gpu.subgroup_mma_compute`.

    Example:

    ```mlir
    %0 = gpu.subgroup_mma_elementwise %A, %B { operation = "ADDF" } :
      (!gpu.mma_matrix<16x16xf16, "COp">, !gpu.mma_matrix<16x16xf16, "COp">)
      -> !gpu.mma_matrix<16x16xf16, "COp">
    ```
  }];

  // AllTypesMatch ensures every variadic operand has the same mma_matrix
  // type; `operation` must name one of the MMAElementWiseAttr cases
  // (e.g. "ADDF", "MULF").
  let arguments = (ins Variadic<GPU_MMAMatrix>:$args, MMAElementWiseAttr:$operation);

  let results = (outs GPU_MMAMatrix:$res);

  // Convenience accessor for the result's mma_matrix type.
  let extraClassDeclaration = [{
    gpu::MMAMatrixType getType() {
      return res().getType().cast<gpu::MMAMatrixType>();
    }
  }];

  let assemblyFormat = [{
    $args attr-dict `:` functional-type($args, $res)
  }];
}
| |
| #endif // GPU_OPS |