[mlir][amdgpu] Add scaled_ext_packed{8,16} operations (#159830)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 8370d35..7184de9 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -112,6 +112,97 @@
}];
}
+def IsValidBlockSize: AttrConstraint<
+ CPred<"::llvm::is_contained({16, 32}, ::llvm::cast<::mlir::IntegerAttr>($_self).getInt())">,
+ "whose value is 16 or 32">;
+
+def AMDGPU_ScaledExtPacked816Op
+ : AMDGPU_Op<"scaled_ext_packed816", [Pure, AllShapesMatch<["source", "res"]>]>,
+ Arguments<(
+ ins AnyTypeOf<[FixedVectorOfShapeAndType<[8], F4E2M1FN>,
+ FixedVectorOfShapeAndType<[8], F8E4M3FN>,
+ FixedVectorOfShapeAndType<[8], F8E5M2>,
+ FixedVectorOfShapeAndType<[16], F6E2M3FN>,
+ FixedVectorOfShapeAndType<[16], F6E3M2FN>]>:$source,
+ FixedVectorOfShapeAndType<[4], F8E8M0FNU>:$scale,
+ ConfinedAttr<I32Attr, [IsValidBlockSize]>:$blockSize,
+ ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<1>]>:$firstScaleLane,
+ ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<2>]>:$firstScaleByte)>,
+ Results<(
+ outs AnyTypeOf<[FixedVectorOfShapeAndType<[8], F32>,
+ FixedVectorOfShapeAndType<[8], F16>,
+ FixedVectorOfShapeAndType<[8], BF16>,
+ FixedVectorOfShapeAndType<[16], F32>,
+ FixedVectorOfShapeAndType<[16], F16>,
+ FixedVectorOfShapeAndType<[16], BF16>]>:$res)> {
+
+ let summary = "Extend a vector of packed floating point values";
+
+ let description = [{
+ The scales applied to the input microfloats are stored in two bytes which
+    come from the `scale` input provided in a *half* of the wave identified
+ by `firstScaleLane`. The pair of bytes used is selected by
+ `firstScaleByte`. The 16 vectors in consecutive lanes starting from
+ `firstScaleLane` (which we'll call the scale vectors) will be used by both
+    halves of the wave (with lane L reading from the (L % 16)'th scale vector), but
+ each half will use a different byte.
+
+ When the block size is 32, `firstScaleByte` can be either 0 or 2,
+ selecting halves of the scale vectors. Lanes 0-15 will read from
+ `firstScaleByte` and lanes 16-31 will read from `firstScaleByte` + 1.
+ For example:
+ ```mlir
+ // Input: 8-element vector of F8E4M3FN, converting to F32
+ // Lanes 0-15 read from byte 0, lanes 16-31 read from byte 1
+ %result = amdgpu.scaled_ext_packed816 %source scale(%scales)
+ blockSize(32) firstScaleLane(0) firstScaleByte(0)
+ : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf32>
+
+ // Input: 16-element vector of F6E2M3FN, converting to F16
+ // Lanes 0-15 read from byte 2, lanes 16-31 read from byte 3
+ %result = amdgpu.scaled_ext_packed816 %source scale(%scales)
+ blockSize(32) firstScaleLane(1) firstScaleByte(2)
+ : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf16>
+ ```
+
+ However, when the block size is 16, `firstScaleByte` can be 0 or 1.
+ Lanes 0-15 read from the `firstScaleByte`th element of the scale vectors,
+ while lanes 16-31 read from `firstScaleByte` + 2.
+ For example:
+ ```mlir
+ // Input: 8-element vector of F8E5M2, converting to BF16
+ // Lanes 0-15 read from byte 0, lanes 16-31 read from byte 2 (0+2)
+ %result = amdgpu.scaled_ext_packed816 %source scale(%scales)
+ blockSize(16) firstScaleLane(0) firstScaleByte(0)
+ : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xbf16>
+
+ // Input: 16-element vector of F6E3M2FN, converting to F32
+ // Lanes 0-15 read from byte 1, lanes 16-31 read from byte 3 (1+2)
+ %result = amdgpu.scaled_ext_packed816 %source scale(%scales)
+ blockSize(16) firstScaleLane(1) firstScaleByte(1)
+ : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf32>
+ ```
+
+  Note: the layout of the scales generally mirrors the one the WMMA
+  instructions use for matrix scales. These selection operands allow
+  one to choose which portions of the matrix to convert.
+
+ Available on gfx1250+.
+ }];
+
+ let assemblyFormat = [{
+ attr-dict $source
+ `scale` `(` $scale `)`
+ `blockSize` `(` $blockSize `)`
+ `firstScaleLane` `(` $firstScaleLane`)`
+ `firstScaleByte` `(` $firstScaleByte `)`
+ `:` type($source) `,` type($scale) `->` type($res)
+ }];
+
+ let hasVerifier = 1;
+
+}
+
def AMDGPU_ScaledExtPackedOp
: AMDGPU_Op<"scaled_ext_packed", [Pure]>,
Arguments<(
@@ -860,7 +951,7 @@
based on the provided `m`, `k`, `n`, and `nBlks` attributes, along with the
types of the source and destination arguments.
- For information on the layouts of the input and output matrces (which are stored
+ For information on the layouts of the input and output matrices (which are stored
in `sourceA`, `sourceB`, `destC`, and `destD`), see the CDNA ISA documentation.
The `cbsz`, `abid`, and `blgp` parameters control how the lanes of the wave
diff --git a/mlir/include/mlir/IR/CommonTypeConstraints.td b/mlir/include/mlir/IR/CommonTypeConstraints.td
index 6b4e3dd..8427ba5 100644
--- a/mlir/include/mlir/IR/CommonTypeConstraints.td
+++ b/mlir/include/mlir/IR/CommonTypeConstraints.td
@@ -623,6 +623,14 @@
VectorOfNonZeroRankOf<allowedTypes>.summary # VectorOfLength<allowedLengths>.summary,
"::mlir::VectorType">;
+class FixedVectorOfShapeAndType<list<int> shape, Type elType>: ShapedContainerType<
+ [elType],
+ And<[IsVectorOfShape<shape>, IsFixedVectorOfAnyRankTypePred]>,
+ "vector<" # !interleave(shape, "x") # "x" # elType # ">",
+ "::mlir::VectorType">,
+ BuildableType<"::mlir::VectorType::get({" # !interleave(shape, " ,") # "} , " # elType.builderCall # " );">;
+
+
// Any fixed-length vector where the number of elements is from the given
// `allowedLengths` list and the type is from the given `allowedTypes` list
class FixedVectorOfLengthAndType<list<int> allowedLengths,
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index c798adb..61166db 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -339,6 +339,25 @@
}
//===----------------------------------------------------------------------===//
+// ScaledExtPacked816Op
+//===----------------------------------------------------------------------===//
+LogicalResult ScaledExtPacked816Op::verify() {
+ int blockSize = getBlockSize();
+ assert((blockSize == 16 || blockSize == 32) && "invalid block size");
+ int firstScaleByte = getFirstScaleByte();
+ if (blockSize == 16 && !llvm::is_contained({0, 1}, firstScaleByte)) {
+ return emitOpError(
+ "blockSize of 16 can only have firstScaleByte be 0 or 1.");
+ }
+ if (blockSize == 32 && !llvm::is_contained({0, 2}, firstScaleByte)) {
+ return emitOpError(
+ "blockSize of 32 can only have firstScaleByte be 0 or 2.");
+ }
+
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
// WMMAOp
//===----------------------------------------------------------------------===//
LogicalResult WMMAOp::verify() {
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 66e7dd4..a8256b1 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -238,3 +238,27 @@
amdgpu.gather_to_lds %mem1[%idx1], %mem2[%idx1] : vector<2xf16>, memref<32xf16>, memref<32xf16, strided<[?]>, #gpu.address_space<workgroup>>
func.return
}
+
+// -----
+
+func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_16(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) {
+ // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 1.}}
+ %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(2) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16>
+ func.return
+}
+
+// -----
+
+func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_32(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) {
+ // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 32 can only have firstScaleByte be 0 or 2.}}
+ %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(1) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16>
+ func.return
+}
+
+// -----
+
+func.func @amdgpu.scaled_ext_packed816_invalid_input_output_sizes(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) {
+ // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op failed to verify that all of {source, res} have same shape}}
+ %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<16xf16>
+ func.return
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 8f427e9..f9c6899 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -221,6 +221,61 @@
func.return %ret : vector<2xbf16>
}
+// CHECK-LABEL: func.func @scaled_ext_packed816_fp4
+func.func @scaled_ext_packed816_fp4(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16>
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xbf16>
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf32>
+ func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed816_fp8
+func.func @scaled_ext_packed816_fp8(%v: vector<8xf8E4M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf16>
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xbf16>
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf32>
+ func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed816_bf8
+func.func @scaled_ext_packed816_bf8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16>
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xbf16>
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf32>
+ func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed816_fp6
+func.func @scaled_ext_packed816_fp6(%v: vector<16xf6E2M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf16>
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xbf16>
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf32>
+ func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed816_bf16
+func.func @scaled_ext_packed816_bf16(%v: vector<16xf6E3M2FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf16>
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xbf16>
+ // CHECK: amdgpu.scaled_ext_packed816
+ %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf32>
+ func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32>
+}
+
// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f32
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_f8e4m3_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E4M3FN> {