blob: 0a36d3dd28f06caf175547290db20a1bfcdbac74 [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,SDAG %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s
; Format codes for the cbsz and blgp immediates of the intrinsic calls below:
; 0 = fp8
; 1 = bf8
; 2 = fp6
; 3 = bf6
; 4 = fp4
; --------------------------------------------------------------------
; Different format signatures
; --------------------------------------------------------------------
; fp8 x fp8
; Baseline case: cbsz = 0, blgp = 0, and both scale-selector immediates (the
; i32 constants that precede %scale0 and %scale1 in the call) are 0.
; Both runs expect a v_mfma_scale_f32_32x32x64_f8f6f4 with op_sel_hi:[0,0,0]
; and no cbsz/blgp modifier. The expected code first issues three
; scratch_load_dword instructions (from s32 at offsets 0, 4 and 8); the value
; loaded from offset:4 is the first scale operand and the value from offset:8
; is the second. The SelectionDAG and GlobalISel expectations differ only in
; which of v32/v33 receives which of those two loads.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; Scale-selector immediates 1 and 1 (the "_1_1" in the name) with cbsz = 0 and
; blgp = 0. Both runs expect op_sel:[1,1,0] op_sel_hi:[0,0,0] on the scaled
; MFMA and no cbsz/blgp modifier; the two expectations differ only in which of
; v32/v33 holds each scale value loaded from scratch.
; NOTE(review): the "__cbsz1__blgp1" name suffix does not match the call,
; which passes 0 for both cbsz and blgp - confirm whether the name is stale.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel:[1,1,0] op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel:[1,1,0] op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
i32 1, i32 %scale0, i32 1, i32 %scale1)
ret <16 x float> %result
}
; Scale-selector immediates 2 and 2 (the "_2_2" in the name) with cbsz = 0 and
; blgp = 0. Both runs expect op_sel_hi:[1,1,0] with no op_sel printed and no
; cbsz/blgp modifier; the two expectations differ only in which of v32/v33
; holds each scale value loaded from scratch.
; NOTE(review): the "__cbsz1__blgp1" name suffix does not match the call,
; which passes 0 for both cbsz and blgp - confirm whether the name is stale.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
i32 2, i32 %scale0, i32 2, i32 %scale1)
ret <16 x float> %result
}
; Scale-selector immediates 3 and 3 (the "_3_3" in the name) with cbsz = 0 and
; blgp = 0. Both runs expect op_sel:[1,1,0] op_sel_hi:[1,1,0] on the scaled
; MFMA and no cbsz/blgp modifier; the two expectations differ only in which of
; v32/v33 holds each scale value loaded from scratch.
; NOTE(review): the "__cbsz1__blgp1" name suffix does not match the call,
; which passes 0 for both cbsz and blgp - confirm whether the name is stale.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
i32 3, i32 %scale0, i32 3, i32 %scale1)
ret <16 x float> %result
}
; Mixed scale-selector immediates 0 and 3 (the "_0_3" in the name) with
; cbsz = 0 and blgp = 0. Both runs expect op_sel:[0,1,0] op_sel_hi:[0,1,0],
; i.e. only the second element of each modifier is set, and no cbsz/blgp
; modifier; the two expectations differ only in which of v32/v33 holds each
; scale value loaded from scratch.
; NOTE(review): the "__cbsz1__blgp1" name suffix does not match the call,
; which passes 0 for both cbsz and blgp - confirm whether the name is stale.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel:[0,1,0] op_sel_hi:[0,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
i32 0, i32 %scale0, i32 3, i32 %scale1)
ret <16 x float> %result
}
; Mixed scale-selector immediates 3 and 0 (the "_3_0" in the name) with
; cbsz = 0 and blgp = 0. Both runs expect op_sel:[1,0,0] op_sel_hi:[1,0,0],
; i.e. only the first element of each modifier is set, and no cbsz/blgp
; modifier; the two expectations differ only in which of v32/v33 holds each
; scale value loaded from scratch.
; NOTE(review): the "__cbsz1__blgp1" name suffix does not match the call,
; which passes 0 for both cbsz and blgp - confirm whether the name is stale.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
i32 3, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; Mixed scale-selector immediates 2 and 3 (the "_2_3" in the name) with
; cbsz = 0 and blgp = 0. Both runs expect op_sel:[0,1,0] op_sel_hi:[1,1,0]
; on the scaled MFMA and no cbsz/blgp modifier; the two expectations differ
; only in which of v32/v33 holds each scale value loaded from scratch.
; NOTE(review): the "__cbsz1__blgp1" name suffix does not match the call,
; which passes 0 for both cbsz and blgp - confirm whether the name is stale.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel:[0,1,0] op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel:[0,1,0] op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
i32 2, i32 %scale0, i32 3, i32 %scale1)
ret <16 x float> %result
}
; Mixed scale-selector immediates 3 and 2 (the "_3_2" in the name) with
; cbsz = 0 and blgp = 0. Both runs expect op_sel:[1,0,0] op_sel_hi:[1,1,0]
; on the scaled MFMA and no cbsz/blgp modifier; the two expectations differ
; only in which of v32/v33 holds each scale value loaded from scratch.
; NOTE(review): the "__cbsz1__blgp1" name suffix does not match the call,
; which passes 0 for both cbsz and blgp - confirm whether the name is stale.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
i32 3, i32 %scale0, i32 2, i32 %scale1)
ret <16 x float> %result
}
; This should be optimized to avoid the scale
; cbsz = 0, blgp = 0, with both scale selectors and both scale values passed
; as the constant 0 (the function takes no scale arguments). Both runs share
; one set of checks and expect the unscaled v_mfma_f32_32x32x64_f8f6f4 with
; no scale operands and no op_sel/op_sel_hi; only the single
; scratch_load_dword into v31 remains before the MFMA.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31]
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v16
; GCN-NEXT: v_mov_b32_e32 v1, v17
; GCN-NEXT: v_mov_b32_e32 v2, v18
; GCN-NEXT: v_mov_b32_e32 v3, v19
; GCN-NEXT: v_mov_b32_e32 v4, v20
; GCN-NEXT: v_mov_b32_e32 v5, v21
; GCN-NEXT: v_mov_b32_e32 v6, v22
; GCN-NEXT: v_mov_b32_e32 v7, v23
; GCN-NEXT: v_mov_b32_e32 v8, v24
; GCN-NEXT: v_mov_b32_e32 v9, v25
; GCN-NEXT: v_mov_b32_e32 v10, v26
; GCN-NEXT: v_mov_b32_e32 v11, v27
; GCN-NEXT: v_mov_b32_e32 v12, v28
; GCN-NEXT: v_mov_b32_e32 v13, v29
; GCN-NEXT: v_mov_b32_e32 v14, v30
; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; fp8 x bf8
; cbsz = 0 and blgp = 1 with both scale-selector immediates 0 and the scales
; taken from the %scale0/%scale1 arguments. Both runs expect the scaled MFMA
; with op_sel_hi:[0,0,0] and a blgp:1 modifier (no cbsz modifier is printed);
; the two expectations differ only in which of v32/v33 holds each scale value
; loaded from scratch.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] blgp:1
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] blgp:1
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 1, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; cbsz = 0 and blgp = 1, with both scale selectors and both scale values
; passed as the constant 0. Both runs share one set of checks and expect the
; unscaled v_mfma_f32_32x32x64_f8f6f4 carrying only the blgp:1 modifier (no
; scale operands, no op_sel/op_sel_hi).
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31] blgp:1
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v16
; GCN-NEXT: v_mov_b32_e32 v1, v17
; GCN-NEXT: v_mov_b32_e32 v2, v18
; GCN-NEXT: v_mov_b32_e32 v3, v19
; GCN-NEXT: v_mov_b32_e32 v4, v20
; GCN-NEXT: v_mov_b32_e32 v5, v21
; GCN-NEXT: v_mov_b32_e32 v6, v22
; GCN-NEXT: v_mov_b32_e32 v7, v23
; GCN-NEXT: v_mov_b32_e32 v8, v24
; GCN-NEXT: v_mov_b32_e32 v9, v25
; GCN-NEXT: v_mov_b32_e32 v10, v26
; GCN-NEXT: v_mov_b32_e32 v11, v27
; GCN-NEXT: v_mov_b32_e32 v12, v28
; GCN-NEXT: v_mov_b32_e32 v13, v29
; GCN-NEXT: v_mov_b32_e32 v14, v30
; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 1, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; fp8 x fp6
; cbsz = 0 and blgp = 2, with %arg1 narrowed to <6 x i32> (the intrinsic is
; the .v8i32.v6i32 overload). Both runs share one set of checks: the second
; source occupies v[8:13], the accumulator v[14:29], and the scales are v30
; and v31, where v31 comes from the single scratch_load_dword. The expected
; instruction carries op_sel_hi:[0,0,0] blgp:2.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] blgp:2
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v14
; GCN-NEXT: v_mov_b32_e32 v1, v15
; GCN-NEXT: v_mov_b32_e32 v2, v16
; GCN-NEXT: v_mov_b32_e32 v3, v17
; GCN-NEXT: v_mov_b32_e32 v4, v18
; GCN-NEXT: v_mov_b32_e32 v5, v19
; GCN-NEXT: v_mov_b32_e32 v6, v20
; GCN-NEXT: v_mov_b32_e32 v7, v21
; GCN-NEXT: v_mov_b32_e32 v8, v22
; GCN-NEXT: v_mov_b32_e32 v9, v23
; GCN-NEXT: v_mov_b32_e32 v10, v24
; GCN-NEXT: v_mov_b32_e32 v11, v25
; GCN-NEXT: v_mov_b32_e32 v12, v26
; GCN-NEXT: v_mov_b32_e32 v13, v27
; GCN-NEXT: v_mov_b32_e32 v14, v28
; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 2, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; cbsz = 0 and blgp = 2 with a <6 x i32> second source, and both scale
; selectors and scale values passed as the constant 0. Both runs share one
; set of checks and expect the unscaled v_mfma_f32_32x32x64_f8f6f4 with only
; the blgp:2 modifier; no scratch load appears in the expected output.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29] blgp:2
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v14
; GCN-NEXT: v_mov_b32_e32 v1, v15
; GCN-NEXT: v_mov_b32_e32 v2, v16
; GCN-NEXT: v_mov_b32_e32 v3, v17
; GCN-NEXT: v_mov_b32_e32 v4, v18
; GCN-NEXT: v_mov_b32_e32 v5, v19
; GCN-NEXT: v_mov_b32_e32 v6, v20
; GCN-NEXT: v_mov_b32_e32 v7, v21
; GCN-NEXT: v_mov_b32_e32 v8, v22
; GCN-NEXT: v_mov_b32_e32 v9, v23
; GCN-NEXT: v_mov_b32_e32 v10, v24
; GCN-NEXT: v_mov_b32_e32 v11, v25
; GCN-NEXT: v_mov_b32_e32 v12, v26
; GCN-NEXT: v_mov_b32_e32 v13, v27
; GCN-NEXT: v_mov_b32_e32 v14, v28
; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 2, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; fp8 x bf6
; cbsz = 0 and blgp = 3, with %arg1 narrowed to <6 x i32> (the intrinsic is
; the .v8i32.v6i32 overload). Both runs share one set of checks: the second
; source occupies v[8:13], the accumulator v[14:29], and the scales are v30
; and v31, where v31 comes from the single scratch_load_dword. The expected
; instruction carries op_sel_hi:[0,0,0] blgp:3.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] blgp:3
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v14
; GCN-NEXT: v_mov_b32_e32 v1, v15
; GCN-NEXT: v_mov_b32_e32 v2, v16
; GCN-NEXT: v_mov_b32_e32 v3, v17
; GCN-NEXT: v_mov_b32_e32 v4, v18
; GCN-NEXT: v_mov_b32_e32 v5, v19
; GCN-NEXT: v_mov_b32_e32 v6, v20
; GCN-NEXT: v_mov_b32_e32 v7, v21
; GCN-NEXT: v_mov_b32_e32 v8, v22
; GCN-NEXT: v_mov_b32_e32 v9, v23
; GCN-NEXT: v_mov_b32_e32 v10, v24
; GCN-NEXT: v_mov_b32_e32 v11, v25
; GCN-NEXT: v_mov_b32_e32 v12, v26
; GCN-NEXT: v_mov_b32_e32 v13, v27
; GCN-NEXT: v_mov_b32_e32 v14, v28
; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 3, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; cbsz = 0 and blgp = 3 with a <6 x i32> second source, and both scale
; selectors and scale values passed as the constant 0. Both runs share one
; set of checks and expect the unscaled v_mfma_f32_32x32x64_f8f6f4 with only
; the blgp:3 modifier; no scratch load appears in the expected output.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29] blgp:3
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v14
; GCN-NEXT: v_mov_b32_e32 v1, v15
; GCN-NEXT: v_mov_b32_e32 v2, v16
; GCN-NEXT: v_mov_b32_e32 v3, v17
; GCN-NEXT: v_mov_b32_e32 v4, v18
; GCN-NEXT: v_mov_b32_e32 v5, v19
; GCN-NEXT: v_mov_b32_e32 v6, v20
; GCN-NEXT: v_mov_b32_e32 v7, v21
; GCN-NEXT: v_mov_b32_e32 v8, v22
; GCN-NEXT: v_mov_b32_e32 v9, v23
; GCN-NEXT: v_mov_b32_e32 v10, v24
; GCN-NEXT: v_mov_b32_e32 v11, v25
; GCN-NEXT: v_mov_b32_e32 v12, v26
; GCN-NEXT: v_mov_b32_e32 v13, v27
; GCN-NEXT: v_mov_b32_e32 v14, v28
; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 3, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; fp8 x fp4
; cbsz = 0 and blgp = 4, with %arg1 narrowed to <4 x i32> (the intrinsic is
; the .v8i32.v4i32 overload). Both runs share one set of checks: the second
; source occupies v[8:11], the accumulator v[12:27], and the scales are v28
; and v29, so no scratch load appears in the expected output. The expected
; instruction carries op_sel_hi:[0,0,0] blgp:4.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:7], v[8:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] blgp:4
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: v_mov_b32_e32 v4, v16
; GCN-NEXT: v_mov_b32_e32 v5, v17
; GCN-NEXT: v_mov_b32_e32 v6, v18
; GCN-NEXT: v_mov_b32_e32 v7, v19
; GCN-NEXT: v_mov_b32_e32 v8, v20
; GCN-NEXT: v_mov_b32_e32 v9, v21
; GCN-NEXT: v_mov_b32_e32 v10, v22
; GCN-NEXT: v_mov_b32_e32 v11, v23
; GCN-NEXT: v_mov_b32_e32 v12, v24
; GCN-NEXT: v_mov_b32_e32 v13, v25
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 4, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; cbsz=0 (fp8) x blgp=4 (fp4) with all four trailing scale operands set to the
; constant 0. The expected code uses the plain v_mfma_f32_32x32x64_f8f6f4 form
; (no scale operands) rather than the v_mfma_scale_ form, then copies the 16
; results from v12-v27 into v0-v15.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:7], v[8:11], v[12:27] blgp:4
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: v_mov_b32_e32 v4, v16
; GCN-NEXT: v_mov_b32_e32 v5, v17
; GCN-NEXT: v_mov_b32_e32 v6, v18
; GCN-NEXT: v_mov_b32_e32 v7, v19
; GCN-NEXT: v_mov_b32_e32 v8, v20
; GCN-NEXT: v_mov_b32_e32 v9, v21
; GCN-NEXT: v_mov_b32_e32 v10, v22
; GCN-NEXT: v_mov_b32_e32 v11, v23
; GCN-NEXT: v_mov_b32_e32 v12, v24
; GCN-NEXT: v_mov_b32_e32 v13, v25
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 4, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; bf8 x fp8
; Scaled MFMA, cbsz=1 (bf8) x blgp=0 (fp8), with the scales supplied by the
; %scale0/%scale1 arguments. The SelectionDAG and GlobalISel runs have separate
; checks: both expect three scratch loads ahead of the v_mfma_scale_
; instruction, but they differ in which of v32/v33 is loaded from offset:4
; versus offset:8 (and so in the order the two registers appear as scales).
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] cbsz:1
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] cbsz:1
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
i32 0, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; cbsz=1 (bf8) x blgp=0 (fp8) with all four trailing scale operands set to the
; constant 0. The expected code uses the plain v_mfma_f32_32x32x64_f8f6f4 form
; (no scale operands); the single scratch load fills v31, the last register of
; the v[16:31] accumulator tuple.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31] cbsz:1
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v16
; GCN-NEXT: v_mov_b32_e32 v1, v17
; GCN-NEXT: v_mov_b32_e32 v2, v18
; GCN-NEXT: v_mov_b32_e32 v3, v19
; GCN-NEXT: v_mov_b32_e32 v4, v20
; GCN-NEXT: v_mov_b32_e32 v5, v21
; GCN-NEXT: v_mov_b32_e32 v6, v22
; GCN-NEXT: v_mov_b32_e32 v7, v23
; GCN-NEXT: v_mov_b32_e32 v8, v24
; GCN-NEXT: v_mov_b32_e32 v9, v25
; GCN-NEXT: v_mov_b32_e32 v10, v26
; GCN-NEXT: v_mov_b32_e32 v11, v27
; GCN-NEXT: v_mov_b32_e32 v12, v28
; GCN-NEXT: v_mov_b32_e32 v13, v29
; GCN-NEXT: v_mov_b32_e32 v14, v30
; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
i32 0, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; bf8 x bf8
; Scaled MFMA, cbsz=1 (bf8) x blgp=1 (bf8), with the scales supplied by the
; %scale0/%scale1 arguments. The SelectionDAG and GlobalISel runs have separate
; checks: both expect three scratch loads ahead of the v_mfma_scale_
; instruction, but they differ in which of v32/v33 is loaded from offset:4
; versus offset:8 (and so in the order the two registers appear as scales).
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] cbsz:1 blgp:1
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
i32 1, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; cbsz=1 (bf8) x blgp=1 (bf8) with all four trailing scale operands set to the
; constant 0. The expected code uses the plain v_mfma_f32_32x32x64_f8f6f4 form
; (no scale operands); the single scratch load fills v31, the last register of
; the v[16:31] accumulator tuple.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31] cbsz:1 blgp:1
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v16
; GCN-NEXT: v_mov_b32_e32 v1, v17
; GCN-NEXT: v_mov_b32_e32 v2, v18
; GCN-NEXT: v_mov_b32_e32 v3, v19
; GCN-NEXT: v_mov_b32_e32 v4, v20
; GCN-NEXT: v_mov_b32_e32 v5, v21
; GCN-NEXT: v_mov_b32_e32 v6, v22
; GCN-NEXT: v_mov_b32_e32 v7, v23
; GCN-NEXT: v_mov_b32_e32 v8, v24
; GCN-NEXT: v_mov_b32_e32 v9, v25
; GCN-NEXT: v_mov_b32_e32 v10, v26
; GCN-NEXT: v_mov_b32_e32 v11, v27
; GCN-NEXT: v_mov_b32_e32 v12, v28
; GCN-NEXT: v_mov_b32_e32 v13, v29
; GCN-NEXT: v_mov_b32_e32 v14, v30
; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
i32 1, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; bf8 x fp6
; Scaled MFMA, cbsz=1 (bf8) x blgp=2 (fp6), with the scales supplied by the
; %scale0/%scale1 arguments. The expected code loads one dword from scratch
; into v31 and uses v30/v31 as the scale operands of the v_mfma_scale_
; instruction, then copies the 16 results from v14-v29 into v0-v15.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v14
; GCN-NEXT: v_mov_b32_e32 v1, v15
; GCN-NEXT: v_mov_b32_e32 v2, v16
; GCN-NEXT: v_mov_b32_e32 v3, v17
; GCN-NEXT: v_mov_b32_e32 v4, v18
; GCN-NEXT: v_mov_b32_e32 v5, v19
; GCN-NEXT: v_mov_b32_e32 v6, v20
; GCN-NEXT: v_mov_b32_e32 v7, v21
; GCN-NEXT: v_mov_b32_e32 v8, v22
; GCN-NEXT: v_mov_b32_e32 v9, v23
; GCN-NEXT: v_mov_b32_e32 v10, v24
; GCN-NEXT: v_mov_b32_e32 v11, v25
; GCN-NEXT: v_mov_b32_e32 v12, v26
; GCN-NEXT: v_mov_b32_e32 v13, v27
; GCN-NEXT: v_mov_b32_e32 v14, v28
; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
i32 2, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; cbsz=1 (bf8) x blgp=2 (fp6) with all four trailing scale operands set to the
; constant 0. The expected code uses the plain v_mfma_f32_32x32x64_f8f6f4 form
; (no scale operands) rather than the v_mfma_scale_ form, then copies the 16
; results from v14-v29 into v0-v15.
; NOTE(review): this test's name ends in __constant_scale_0 while the other
; constant-scale tests in this file end in __constant_scale_0_0; the name is
; left unchanged here.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29] cbsz:1 blgp:2
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v14
; GCN-NEXT: v_mov_b32_e32 v1, v15
; GCN-NEXT: v_mov_b32_e32 v2, v16
; GCN-NEXT: v_mov_b32_e32 v3, v17
; GCN-NEXT: v_mov_b32_e32 v4, v18
; GCN-NEXT: v_mov_b32_e32 v5, v19
; GCN-NEXT: v_mov_b32_e32 v6, v20
; GCN-NEXT: v_mov_b32_e32 v7, v21
; GCN-NEXT: v_mov_b32_e32 v8, v22
; GCN-NEXT: v_mov_b32_e32 v9, v23
; GCN-NEXT: v_mov_b32_e32 v10, v24
; GCN-NEXT: v_mov_b32_e32 v11, v25
; GCN-NEXT: v_mov_b32_e32 v12, v26
; GCN-NEXT: v_mov_b32_e32 v13, v27
; GCN-NEXT: v_mov_b32_e32 v14, v28
; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
i32 2, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; bf8 x bf6
; Scaled MFMA, cbsz=1 (bf8) x blgp=3 (bf6), with the scales supplied by the
; %scale0/%scale1 arguments. The expected code loads one dword from scratch
; into v31 and uses v30/v31 as the scale operands of the v_mfma_scale_
; instruction, then copies the 16 results from v14-v29 into v0-v15.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v14
; GCN-NEXT: v_mov_b32_e32 v1, v15
; GCN-NEXT: v_mov_b32_e32 v2, v16
; GCN-NEXT: v_mov_b32_e32 v3, v17
; GCN-NEXT: v_mov_b32_e32 v4, v18
; GCN-NEXT: v_mov_b32_e32 v5, v19
; GCN-NEXT: v_mov_b32_e32 v6, v20
; GCN-NEXT: v_mov_b32_e32 v7, v21
; GCN-NEXT: v_mov_b32_e32 v8, v22
; GCN-NEXT: v_mov_b32_e32 v9, v23
; GCN-NEXT: v_mov_b32_e32 v10, v24
; GCN-NEXT: v_mov_b32_e32 v11, v25
; GCN-NEXT: v_mov_b32_e32 v12, v26
; GCN-NEXT: v_mov_b32_e32 v13, v27
; GCN-NEXT: v_mov_b32_e32 v14, v28
; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
i32 3, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; cbsz=1 (bf8) x blgp=3 (bf6) with all four trailing scale operands set to the
; constant 0. The expected code uses the plain v_mfma_f32_32x32x64_f8f6f4 form
; (no scale operands) rather than the v_mfma_scale_ form, then copies the 16
; results from v14-v29 into v0-v15.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29] cbsz:1 blgp:3
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v14
; GCN-NEXT: v_mov_b32_e32 v1, v15
; GCN-NEXT: v_mov_b32_e32 v2, v16
; GCN-NEXT: v_mov_b32_e32 v3, v17
; GCN-NEXT: v_mov_b32_e32 v4, v18
; GCN-NEXT: v_mov_b32_e32 v5, v19
; GCN-NEXT: v_mov_b32_e32 v6, v20
; GCN-NEXT: v_mov_b32_e32 v7, v21
; GCN-NEXT: v_mov_b32_e32 v8, v22
; GCN-NEXT: v_mov_b32_e32 v9, v23
; GCN-NEXT: v_mov_b32_e32 v10, v24
; GCN-NEXT: v_mov_b32_e32 v11, v25
; GCN-NEXT: v_mov_b32_e32 v12, v26
; GCN-NEXT: v_mov_b32_e32 v13, v27
; GCN-NEXT: v_mov_b32_e32 v14, v28
; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
i32 3, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; bf8 x fp4
; Scaled MFMA, cbsz=1 (bf8) x blgp=4 (fp4), with the scales supplied by the
; %scale0/%scale1 arguments. The expected code is the v_mfma_scale_ form
; reading the scales from v28/v29 (no scratch loads), followed by copies of
; the 16 results from v12-v27 into v0-v15.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:7], v[8:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: v_mov_b32_e32 v4, v16
; GCN-NEXT: v_mov_b32_e32 v5, v17
; GCN-NEXT: v_mov_b32_e32 v6, v18
; GCN-NEXT: v_mov_b32_e32 v7, v19
; GCN-NEXT: v_mov_b32_e32 v8, v20
; GCN-NEXT: v_mov_b32_e32 v9, v21
; GCN-NEXT: v_mov_b32_e32 v10, v22
; GCN-NEXT: v_mov_b32_e32 v11, v23
; GCN-NEXT: v_mov_b32_e32 v12, v24
; GCN-NEXT: v_mov_b32_e32 v13, v25
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
i32 4, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; cbsz=1 (bf8) x blgp=4 (fp4) with all four trailing scale operands set to the
; constant 0. The expected code uses the plain v_mfma_f32_32x32x64_f8f6f4 form
; (no scale operands) rather than the v_mfma_scale_ form, then copies the 16
; results from v12-v27 into v0-v15.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:7], v[8:11], v[12:27] cbsz:1 blgp:4
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: v_mov_b32_e32 v4, v16
; GCN-NEXT: v_mov_b32_e32 v5, v17
; GCN-NEXT: v_mov_b32_e32 v6, v18
; GCN-NEXT: v_mov_b32_e32 v7, v19
; GCN-NEXT: v_mov_b32_e32 v8, v20
; GCN-NEXT: v_mov_b32_e32 v9, v21
; GCN-NEXT: v_mov_b32_e32 v10, v22
; GCN-NEXT: v_mov_b32_e32 v11, v23
; GCN-NEXT: v_mov_b32_e32 v12, v24
; GCN-NEXT: v_mov_b32_e32 v13, v25
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
i32 4, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; fp6 x fp8
; Scaled MFMA, cbsz=2 (fp6) x blgp=0 (fp8), with the scales supplied by the
; %scale0/%scale1 arguments. The expected code loads one dword from scratch
; into v31 and uses v30/v31 as the scale operands of the v_mfma_scale_
; instruction, then copies the 16 results from v14-v29 into v0-v15.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:2
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v14
; GCN-NEXT: v_mov_b32_e32 v1, v15
; GCN-NEXT: v_mov_b32_e32 v2, v16
; GCN-NEXT: v_mov_b32_e32 v3, v17
; GCN-NEXT: v_mov_b32_e32 v4, v18
; GCN-NEXT: v_mov_b32_e32 v5, v19
; GCN-NEXT: v_mov_b32_e32 v6, v20
; GCN-NEXT: v_mov_b32_e32 v7, v21
; GCN-NEXT: v_mov_b32_e32 v8, v22
; GCN-NEXT: v_mov_b32_e32 v9, v23
; GCN-NEXT: v_mov_b32_e32 v10, v24
; GCN-NEXT: v_mov_b32_e32 v11, v25
; GCN-NEXT: v_mov_b32_e32 v12, v26
; GCN-NEXT: v_mov_b32_e32 v13, v27
; GCN-NEXT: v_mov_b32_e32 v14, v28
; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
i32 0, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; cbsz=2 (fp6) x blgp=0 (fp8) with all four trailing scale operands set to the
; constant 0. The expected code uses the plain v_mfma_f32_32x32x64_f8f6f4 form
; (no scale operands) rather than the v_mfma_scale_ form, then copies the 16
; results from v14-v29 into v0-v15.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29] cbsz:2
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v14
; GCN-NEXT: v_mov_b32_e32 v1, v15
; GCN-NEXT: v_mov_b32_e32 v2, v16
; GCN-NEXT: v_mov_b32_e32 v3, v17
; GCN-NEXT: v_mov_b32_e32 v4, v18
; GCN-NEXT: v_mov_b32_e32 v5, v19
; GCN-NEXT: v_mov_b32_e32 v6, v20
; GCN-NEXT: v_mov_b32_e32 v7, v21
; GCN-NEXT: v_mov_b32_e32 v8, v22
; GCN-NEXT: v_mov_b32_e32 v9, v23
; GCN-NEXT: v_mov_b32_e32 v10, v24
; GCN-NEXT: v_mov_b32_e32 v11, v25
; GCN-NEXT: v_mov_b32_e32 v12, v26
; GCN-NEXT: v_mov_b32_e32 v13, v27
; GCN-NEXT: v_mov_b32_e32 v14, v28
; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
i32 0, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; fp6 x bf8
; Scaled MFMA, cbsz=2 (fp6) x blgp=1 (bf8), with the scales supplied by the
; %scale0/%scale1 arguments. The expected code loads one dword from scratch
; into v31 and uses v30/v31 as the scale operands of the v_mfma_scale_
; instruction, then copies the 16 results from v14-v29 into v0-v15.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v14
; GCN-NEXT: v_mov_b32_e32 v1, v15
; GCN-NEXT: v_mov_b32_e32 v2, v16
; GCN-NEXT: v_mov_b32_e32 v3, v17
; GCN-NEXT: v_mov_b32_e32 v4, v18
; GCN-NEXT: v_mov_b32_e32 v5, v19
; GCN-NEXT: v_mov_b32_e32 v6, v20
; GCN-NEXT: v_mov_b32_e32 v7, v21
; GCN-NEXT: v_mov_b32_e32 v8, v22
; GCN-NEXT: v_mov_b32_e32 v9, v23
; GCN-NEXT: v_mov_b32_e32 v10, v24
; GCN-NEXT: v_mov_b32_e32 v11, v25
; GCN-NEXT: v_mov_b32_e32 v12, v26
; GCN-NEXT: v_mov_b32_e32 v13, v27
; GCN-NEXT: v_mov_b32_e32 v14, v28
; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
i32 1, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; cbsz=2 (fp6) x blgp=1 (bf8) with all four trailing scale operands set to the
; constant 0. The expected code uses the plain v_mfma_f32_32x32x64_f8f6f4 form
; (no scale operands) rather than the v_mfma_scale_ form, then copies the 16
; results from v14-v29 into v0-v15.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29] cbsz:2 blgp:1
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v14
; GCN-NEXT: v_mov_b32_e32 v1, v15
; GCN-NEXT: v_mov_b32_e32 v2, v16
; GCN-NEXT: v_mov_b32_e32 v3, v17
; GCN-NEXT: v_mov_b32_e32 v4, v18
; GCN-NEXT: v_mov_b32_e32 v5, v19
; GCN-NEXT: v_mov_b32_e32 v6, v20
; GCN-NEXT: v_mov_b32_e32 v7, v21
; GCN-NEXT: v_mov_b32_e32 v8, v22
; GCN-NEXT: v_mov_b32_e32 v9, v23
; GCN-NEXT: v_mov_b32_e32 v10, v24
; GCN-NEXT: v_mov_b32_e32 v11, v25
; GCN-NEXT: v_mov_b32_e32 v12, v26
; GCN-NEXT: v_mov_b32_e32 v13, v27
; GCN-NEXT: v_mov_b32_e32 v14, v28
; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
i32 1, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; fp6 x fp6
; Scaled MFMA, cbsz=2 (fp6) x blgp=2 (fp6), with the scales supplied by the
; %scale0/%scale1 arguments. The expected code is the v_mfma_scale_ form
; reading the scales from v28/v29 (no scratch loads). Only a single s_nop 11
; is expected between the MFMA and the copies of the results from v12-v27
; into v0-v15.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: v_mov_b32_e32 v4, v16
; GCN-NEXT: v_mov_b32_e32 v5, v17
; GCN-NEXT: v_mov_b32_e32 v6, v18
; GCN-NEXT: v_mov_b32_e32 v7, v19
; GCN-NEXT: v_mov_b32_e32 v8, v20
; GCN-NEXT: v_mov_b32_e32 v9, v21
; GCN-NEXT: v_mov_b32_e32 v10, v22
; GCN-NEXT: v_mov_b32_e32 v11, v23
; GCN-NEXT: v_mov_b32_e32 v12, v24
; GCN-NEXT: v_mov_b32_e32 v13, v25
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
i32 2, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; cbsz=2 (fp6) x blgp=2 (fp6) with all four trailing scale operands set to the
; constant 0. The expected code uses the plain v_mfma_f32_32x32x64_f8f6f4 form
; (no scale operands) rather than the v_mfma_scale_ form. Only a single
; s_nop 11 is expected between the MFMA and the copies of the results from
; v12-v27 into v0-v15.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27] cbsz:2 blgp:2
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: v_mov_b32_e32 v4, v16
; GCN-NEXT: v_mov_b32_e32 v5, v17
; GCN-NEXT: v_mov_b32_e32 v6, v18
; GCN-NEXT: v_mov_b32_e32 v7, v19
; GCN-NEXT: v_mov_b32_e32 v8, v20
; GCN-NEXT: v_mov_b32_e32 v9, v21
; GCN-NEXT: v_mov_b32_e32 v10, v22
; GCN-NEXT: v_mov_b32_e32 v11, v23
; GCN-NEXT: v_mov_b32_e32 v12, v24
; GCN-NEXT: v_mov_b32_e32 v13, v25
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
i32 2, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; fp6 x bf6
; Scaled MFMA, cbsz=2 (fp6) x blgp=3 (bf6), with the scales supplied by the
; %scale0/%scale1 arguments. The expected code is the v_mfma_scale_ form
; reading the scales from v28/v29 (no scratch loads). Only a single s_nop 11
; is expected between the MFMA and the copies of the results from v12-v27
; into v0-v15.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: v_mov_b32_e32 v4, v16
; GCN-NEXT: v_mov_b32_e32 v5, v17
; GCN-NEXT: v_mov_b32_e32 v6, v18
; GCN-NEXT: v_mov_b32_e32 v7, v19
; GCN-NEXT: v_mov_b32_e32 v8, v20
; GCN-NEXT: v_mov_b32_e32 v9, v21
; GCN-NEXT: v_mov_b32_e32 v10, v22
; GCN-NEXT: v_mov_b32_e32 v11, v23
; GCN-NEXT: v_mov_b32_e32 v12, v24
; GCN-NEXT: v_mov_b32_e32 v13, v25
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
i32 3, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; cbsz=2 (fp6) x blgp=3 (bf6) with all four trailing scale operands set to the
; constant 0. The expected code uses the plain v_mfma_f32_32x32x64_f8f6f4 form
; (no scale operands) rather than the v_mfma_scale_ form. Only a single
; s_nop 11 is expected between the MFMA and the copies of the results from
; v12-v27 into v0-v15.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27] cbsz:2 blgp:3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: v_mov_b32_e32 v4, v16
; GCN-NEXT: v_mov_b32_e32 v5, v17
; GCN-NEXT: v_mov_b32_e32 v6, v18
; GCN-NEXT: v_mov_b32_e32 v7, v19
; GCN-NEXT: v_mov_b32_e32 v8, v20
; GCN-NEXT: v_mov_b32_e32 v9, v21
; GCN-NEXT: v_mov_b32_e32 v10, v22
; GCN-NEXT: v_mov_b32_e32 v11, v23
; GCN-NEXT: v_mov_b32_e32 v12, v24
; GCN-NEXT: v_mov_b32_e32 v13, v25
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
i32 3, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; bf6 x fp8
; Scaled MFMA, cbsz=3 (bf6) x blgp=0 (fp8), with the scales supplied by the
; %scale0/%scale1 arguments. The expected code loads one dword from scratch
; into v31 and uses v30/v31 as the scale operands of the v_mfma_scale_
; instruction, then copies the 16 results from v14-v29 into v0-v15.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:3
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v14
; GCN-NEXT: v_mov_b32_e32 v1, v15
; GCN-NEXT: v_mov_b32_e32 v2, v16
; GCN-NEXT: v_mov_b32_e32 v3, v17
; GCN-NEXT: v_mov_b32_e32 v4, v18
; GCN-NEXT: v_mov_b32_e32 v5, v19
; GCN-NEXT: v_mov_b32_e32 v6, v20
; GCN-NEXT: v_mov_b32_e32 v7, v21
; GCN-NEXT: v_mov_b32_e32 v8, v22
; GCN-NEXT: v_mov_b32_e32 v9, v23
; GCN-NEXT: v_mov_b32_e32 v10, v24
; GCN-NEXT: v_mov_b32_e32 v11, v25
; GCN-NEXT: v_mov_b32_e32 v12, v26
; GCN-NEXT: v_mov_b32_e32 v13, v27
; GCN-NEXT: v_mov_b32_e32 v14, v28
; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
i32 0, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; cbsz=3 (bf6) x blgp=0 (fp8) with all four trailing scale operands set to the
; constant 0. The expected code uses the plain v_mfma_f32_32x32x64_f8f6f4 form
; (no scale operands) rather than the v_mfma_scale_ form, then copies the 16
; results from v14-v29 into v0-v15.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29] cbsz:3
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v14
; GCN-NEXT: v_mov_b32_e32 v1, v15
; GCN-NEXT: v_mov_b32_e32 v2, v16
; GCN-NEXT: v_mov_b32_e32 v3, v17
; GCN-NEXT: v_mov_b32_e32 v4, v18
; GCN-NEXT: v_mov_b32_e32 v5, v19
; GCN-NEXT: v_mov_b32_e32 v6, v20
; GCN-NEXT: v_mov_b32_e32 v7, v21
; GCN-NEXT: v_mov_b32_e32 v8, v22
; GCN-NEXT: v_mov_b32_e32 v9, v23
; GCN-NEXT: v_mov_b32_e32 v10, v24
; GCN-NEXT: v_mov_b32_e32 v11, v25
; GCN-NEXT: v_mov_b32_e32 v12, v26
; GCN-NEXT: v_mov_b32_e32 v13, v27
; GCN-NEXT: v_mov_b32_e32 v14, v28
; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
i32 0, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; bf6 x bf8
; Scaled MFMA, cbsz=3 (bf6) x blgp=1 (bf8), with the scales supplied by the
; %scale0/%scale1 arguments. The expected code loads one dword from scratch
; into v31 and uses v30/v31 as the scale operands of the v_mfma_scale_
; instruction, then copies the 16 results from v14-v29 into v0-v15.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:3 blgp:1
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v14
; GCN-NEXT: v_mov_b32_e32 v1, v15
; GCN-NEXT: v_mov_b32_e32 v2, v16
; GCN-NEXT: v_mov_b32_e32 v3, v17
; GCN-NEXT: v_mov_b32_e32 v4, v18
; GCN-NEXT: v_mov_b32_e32 v5, v19
; GCN-NEXT: v_mov_b32_e32 v6, v20
; GCN-NEXT: v_mov_b32_e32 v7, v21
; GCN-NEXT: v_mov_b32_e32 v8, v22
; GCN-NEXT: v_mov_b32_e32 v9, v23
; GCN-NEXT: v_mov_b32_e32 v10, v24
; GCN-NEXT: v_mov_b32_e32 v11, v25
; GCN-NEXT: v_mov_b32_e32 v12, v26
; GCN-NEXT: v_mov_b32_e32 v13, v27
; GCN-NEXT: v_mov_b32_e32 v14, v28
; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
i32 1, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; bf6 x bf8 (cbsz = 3, blgp = 1) with the four trailing i32 operands of the
; call all constant 0. The checks expect the unscaled
; v_mfma_f32_32x32x64_f8f6f4 form, with no scale registers and no op_sel_hi.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29] cbsz:3 blgp:1
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v14
; GCN-NEXT: v_mov_b32_e32 v1, v15
; GCN-NEXT: v_mov_b32_e32 v2, v16
; GCN-NEXT: v_mov_b32_e32 v3, v17
; GCN-NEXT: v_mov_b32_e32 v4, v18
; GCN-NEXT: v_mov_b32_e32 v5, v19
; GCN-NEXT: v_mov_b32_e32 v6, v20
; GCN-NEXT: v_mov_b32_e32 v7, v21
; GCN-NEXT: v_mov_b32_e32 v8, v22
; GCN-NEXT: v_mov_b32_e32 v9, v23
; GCN-NEXT: v_mov_b32_e32 v10, v24
; GCN-NEXT: v_mov_b32_e32 v11, v25
; GCN-NEXT: v_mov_b32_e32 v12, v26
; GCN-NEXT: v_mov_b32_e32 v13, v27
; GCN-NEXT: v_mov_b32_e32 v14, v28
; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
i32 1, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; bf6 x fp6
; Scaled MFMA with a bf6 A operand (cbsz = 3) and an fp6 B operand (blgp = 2),
; both passed as <6 x i32>. Both scales are plain i32 arguments; the checks
; expect them in v28 and v29 and the instruction to carry cbsz:3 blgp:2.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: v_mov_b32_e32 v4, v16
; GCN-NEXT: v_mov_b32_e32 v5, v17
; GCN-NEXT: v_mov_b32_e32 v6, v18
; GCN-NEXT: v_mov_b32_e32 v7, v19
; GCN-NEXT: v_mov_b32_e32 v8, v20
; GCN-NEXT: v_mov_b32_e32 v9, v21
; GCN-NEXT: v_mov_b32_e32 v10, v22
; GCN-NEXT: v_mov_b32_e32 v11, v23
; GCN-NEXT: v_mov_b32_e32 v12, v24
; GCN-NEXT: v_mov_b32_e32 v13, v25
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
i32 2, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; bf6 x fp6 (cbsz = 3, blgp = 2) with the four trailing i32 operands of the
; call all constant 0. The checks expect the unscaled
; v_mfma_f32_32x32x64_f8f6f4 form, with no scale registers and no op_sel_hi.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27] cbsz:3 blgp:2
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: v_mov_b32_e32 v4, v16
; GCN-NEXT: v_mov_b32_e32 v5, v17
; GCN-NEXT: v_mov_b32_e32 v6, v18
; GCN-NEXT: v_mov_b32_e32 v7, v19
; GCN-NEXT: v_mov_b32_e32 v8, v20
; GCN-NEXT: v_mov_b32_e32 v9, v21
; GCN-NEXT: v_mov_b32_e32 v10, v22
; GCN-NEXT: v_mov_b32_e32 v11, v23
; GCN-NEXT: v_mov_b32_e32 v12, v24
; GCN-NEXT: v_mov_b32_e32 v13, v25
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
i32 2, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; bf6 x fp4
; Scaled MFMA with a bf6 A operand (cbsz = 3, <6 x i32>) and an fp4 B operand
; (blgp = 4, <4 x i32>). Both scales are plain i32 arguments; the checks expect
; them in v26 and v27 and the instruction to carry cbsz:3 blgp:4.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[10:25], v[0:5], v[6:9], v[10:25], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v10
; GCN-NEXT: v_mov_b32_e32 v1, v11
; GCN-NEXT: v_mov_b32_e32 v2, v12
; GCN-NEXT: v_mov_b32_e32 v3, v13
; GCN-NEXT: v_mov_b32_e32 v4, v14
; GCN-NEXT: v_mov_b32_e32 v5, v15
; GCN-NEXT: v_mov_b32_e32 v6, v16
; GCN-NEXT: v_mov_b32_e32 v7, v17
; GCN-NEXT: v_mov_b32_e32 v8, v18
; GCN-NEXT: v_mov_b32_e32 v9, v19
; GCN-NEXT: v_mov_b32_e32 v10, v20
; GCN-NEXT: v_mov_b32_e32 v11, v21
; GCN-NEXT: v_mov_b32_e32 v12, v22
; GCN-NEXT: v_mov_b32_e32 v13, v23
; GCN-NEXT: v_mov_b32_e32 v14, v24
; GCN-NEXT: v_mov_b32_e32 v15, v25
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
i32 4, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; bf6 x fp4 (cbsz = 3, blgp = 4) with the four trailing i32 operands of the
; call all constant 0. The checks expect the unscaled
; v_mfma_f32_32x32x64_f8f6f4 form, with no scale registers and no op_sel_hi.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[10:25], v[0:5], v[6:9], v[10:25] cbsz:3 blgp:4
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v10
; GCN-NEXT: v_mov_b32_e32 v1, v11
; GCN-NEXT: v_mov_b32_e32 v2, v12
; GCN-NEXT: v_mov_b32_e32 v3, v13
; GCN-NEXT: v_mov_b32_e32 v4, v14
; GCN-NEXT: v_mov_b32_e32 v5, v15
; GCN-NEXT: v_mov_b32_e32 v6, v16
; GCN-NEXT: v_mov_b32_e32 v7, v17
; GCN-NEXT: v_mov_b32_e32 v8, v18
; GCN-NEXT: v_mov_b32_e32 v9, v19
; GCN-NEXT: v_mov_b32_e32 v10, v20
; GCN-NEXT: v_mov_b32_e32 v11, v21
; GCN-NEXT: v_mov_b32_e32 v12, v22
; GCN-NEXT: v_mov_b32_e32 v13, v23
; GCN-NEXT: v_mov_b32_e32 v14, v24
; GCN-NEXT: v_mov_b32_e32 v15, v25
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
i32 4, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; bf6 x bf6
; Scaled MFMA with bf6 for both the A operand (cbsz = 3) and the B operand
; (blgp = 3), each passed as <6 x i32>. Both scales are plain i32 arguments;
; the checks expect them in v28 and v29 and the instruction to carry
; cbsz:3 blgp:3.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: v_mov_b32_e32 v4, v16
; GCN-NEXT: v_mov_b32_e32 v5, v17
; GCN-NEXT: v_mov_b32_e32 v6, v18
; GCN-NEXT: v_mov_b32_e32 v7, v19
; GCN-NEXT: v_mov_b32_e32 v8, v20
; GCN-NEXT: v_mov_b32_e32 v9, v21
; GCN-NEXT: v_mov_b32_e32 v10, v22
; GCN-NEXT: v_mov_b32_e32 v11, v23
; GCN-NEXT: v_mov_b32_e32 v12, v24
; GCN-NEXT: v_mov_b32_e32 v13, v25
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
i32 3, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; bf6 x bf6 (cbsz = 3, blgp = 3) with the four trailing i32 operands of the
; call all constant 0. The checks expect the unscaled
; v_mfma_f32_32x32x64_f8f6f4 form, with no scale registers and no op_sel_hi.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27] cbsz:3 blgp:3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: v_mov_b32_e32 v4, v16
; GCN-NEXT: v_mov_b32_e32 v5, v17
; GCN-NEXT: v_mov_b32_e32 v6, v18
; GCN-NEXT: v_mov_b32_e32 v7, v19
; GCN-NEXT: v_mov_b32_e32 v8, v20
; GCN-NEXT: v_mov_b32_e32 v9, v21
; GCN-NEXT: v_mov_b32_e32 v10, v22
; GCN-NEXT: v_mov_b32_e32 v11, v23
; GCN-NEXT: v_mov_b32_e32 v12, v24
; GCN-NEXT: v_mov_b32_e32 v13, v25
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
i32 3, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; fp6 x fp4
; Scaled MFMA with an fp6 A operand (cbsz = 2, <6 x i32>) and an fp4 B operand
; (blgp = 4, <4 x i32>). Both scales are plain i32 arguments; the checks expect
; them in v26 and v27 and the instruction to carry cbsz:2 blgp:4.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[10:25], v[0:5], v[6:9], v[10:25], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v10
; GCN-NEXT: v_mov_b32_e32 v1, v11
; GCN-NEXT: v_mov_b32_e32 v2, v12
; GCN-NEXT: v_mov_b32_e32 v3, v13
; GCN-NEXT: v_mov_b32_e32 v4, v14
; GCN-NEXT: v_mov_b32_e32 v5, v15
; GCN-NEXT: v_mov_b32_e32 v6, v16
; GCN-NEXT: v_mov_b32_e32 v7, v17
; GCN-NEXT: v_mov_b32_e32 v8, v18
; GCN-NEXT: v_mov_b32_e32 v9, v19
; GCN-NEXT: v_mov_b32_e32 v10, v20
; GCN-NEXT: v_mov_b32_e32 v11, v21
; GCN-NEXT: v_mov_b32_e32 v12, v22
; GCN-NEXT: v_mov_b32_e32 v13, v23
; GCN-NEXT: v_mov_b32_e32 v14, v24
; GCN-NEXT: v_mov_b32_e32 v15, v25
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
i32 4, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; fp6 x fp4 (cbsz = 2, blgp = 4) with the four trailing i32 operands of the
; call all constant 0. The checks expect the unscaled
; v_mfma_f32_32x32x64_f8f6f4 form, with no scale registers and no op_sel_hi.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[10:25], v[0:5], v[6:9], v[10:25] cbsz:2 blgp:4
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v10
; GCN-NEXT: v_mov_b32_e32 v1, v11
; GCN-NEXT: v_mov_b32_e32 v2, v12
; GCN-NEXT: v_mov_b32_e32 v3, v13
; GCN-NEXT: v_mov_b32_e32 v4, v14
; GCN-NEXT: v_mov_b32_e32 v5, v15
; GCN-NEXT: v_mov_b32_e32 v6, v16
; GCN-NEXT: v_mov_b32_e32 v7, v17
; GCN-NEXT: v_mov_b32_e32 v8, v18
; GCN-NEXT: v_mov_b32_e32 v9, v19
; GCN-NEXT: v_mov_b32_e32 v10, v20
; GCN-NEXT: v_mov_b32_e32 v11, v21
; GCN-NEXT: v_mov_b32_e32 v12, v22
; GCN-NEXT: v_mov_b32_e32 v13, v23
; GCN-NEXT: v_mov_b32_e32 v14, v24
; GCN-NEXT: v_mov_b32_e32 v15, v25
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
i32 4, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; fp4 x fp8
; Scaled MFMA with an fp4 A operand (cbsz = 4, <4 x i32>) and an fp8 B operand
; (blgp = 0, <8 x i32>). Both scales are plain i32 arguments; the checks expect
; them in v28 and v29 and the instruction to print cbsz:4 with no blgp
; modifier.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:3], v[4:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:4
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: v_mov_b32_e32 v4, v16
; GCN-NEXT: v_mov_b32_e32 v5, v17
; GCN-NEXT: v_mov_b32_e32 v6, v18
; GCN-NEXT: v_mov_b32_e32 v7, v19
; GCN-NEXT: v_mov_b32_e32 v8, v20
; GCN-NEXT: v_mov_b32_e32 v9, v21
; GCN-NEXT: v_mov_b32_e32 v10, v22
; GCN-NEXT: v_mov_b32_e32 v11, v23
; GCN-NEXT: v_mov_b32_e32 v12, v24
; GCN-NEXT: v_mov_b32_e32 v13, v25
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
i32 0, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; fp4 x fp8 (cbsz = 4, blgp = 0) with the four trailing i32 operands of the
; call all constant 0. The checks expect the unscaled
; v_mfma_f32_32x32x64_f8f6f4 form, with no scale registers and no op_sel_hi.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:3], v[4:11], v[12:27] cbsz:4
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: v_mov_b32_e32 v4, v16
; GCN-NEXT: v_mov_b32_e32 v5, v17
; GCN-NEXT: v_mov_b32_e32 v6, v18
; GCN-NEXT: v_mov_b32_e32 v7, v19
; GCN-NEXT: v_mov_b32_e32 v8, v20
; GCN-NEXT: v_mov_b32_e32 v9, v21
; GCN-NEXT: v_mov_b32_e32 v10, v22
; GCN-NEXT: v_mov_b32_e32 v11, v23
; GCN-NEXT: v_mov_b32_e32 v12, v24
; GCN-NEXT: v_mov_b32_e32 v13, v25
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
i32 0, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; fp4 x bf8
; Scaled MFMA with an fp4 A operand (cbsz = 4, <4 x i32>) and a bf8 B operand
; (blgp = 1, <8 x i32>). Both scales are plain i32 arguments; the checks expect
; them in v28 and v29 and the instruction to carry cbsz:4 blgp:1.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:3], v[4:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: v_mov_b32_e32 v4, v16
; GCN-NEXT: v_mov_b32_e32 v5, v17
; GCN-NEXT: v_mov_b32_e32 v6, v18
; GCN-NEXT: v_mov_b32_e32 v7, v19
; GCN-NEXT: v_mov_b32_e32 v8, v20
; GCN-NEXT: v_mov_b32_e32 v9, v21
; GCN-NEXT: v_mov_b32_e32 v10, v22
; GCN-NEXT: v_mov_b32_e32 v11, v23
; GCN-NEXT: v_mov_b32_e32 v12, v24
; GCN-NEXT: v_mov_b32_e32 v13, v25
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
i32 1, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; fp4 x bf8 (cbsz = 4, blgp = 1) with the four trailing i32 operands of the
; call all constant 0. The checks expect the unscaled
; v_mfma_f32_32x32x64_f8f6f4 form, with no scale registers and no op_sel_hi.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:3], v[4:11], v[12:27] cbsz:4 blgp:1
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v12
; GCN-NEXT: v_mov_b32_e32 v1, v13
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: v_mov_b32_e32 v4, v16
; GCN-NEXT: v_mov_b32_e32 v5, v17
; GCN-NEXT: v_mov_b32_e32 v6, v18
; GCN-NEXT: v_mov_b32_e32 v7, v19
; GCN-NEXT: v_mov_b32_e32 v8, v20
; GCN-NEXT: v_mov_b32_e32 v9, v21
; GCN-NEXT: v_mov_b32_e32 v10, v22
; GCN-NEXT: v_mov_b32_e32 v11, v23
; GCN-NEXT: v_mov_b32_e32 v12, v24
; GCN-NEXT: v_mov_b32_e32 v13, v25
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
i32 1, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; fp4 x fp6
; Scaled MFMA with an fp4 A operand (cbsz = 4, <4 x i32>) and an fp6 B operand
; (blgp = 2, <6 x i32>). Both scales are plain i32 arguments; the checks expect
; them in v26 and v27 and the instruction to carry cbsz:4 blgp:2.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[10:25], v[0:3], v[4:9], v[10:25], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v10
; GCN-NEXT: v_mov_b32_e32 v1, v11
; GCN-NEXT: v_mov_b32_e32 v2, v12
; GCN-NEXT: v_mov_b32_e32 v3, v13
; GCN-NEXT: v_mov_b32_e32 v4, v14
; GCN-NEXT: v_mov_b32_e32 v5, v15
; GCN-NEXT: v_mov_b32_e32 v6, v16
; GCN-NEXT: v_mov_b32_e32 v7, v17
; GCN-NEXT: v_mov_b32_e32 v8, v18
; GCN-NEXT: v_mov_b32_e32 v9, v19
; GCN-NEXT: v_mov_b32_e32 v10, v20
; GCN-NEXT: v_mov_b32_e32 v11, v21
; GCN-NEXT: v_mov_b32_e32 v12, v22
; GCN-NEXT: v_mov_b32_e32 v13, v23
; GCN-NEXT: v_mov_b32_e32 v14, v24
; GCN-NEXT: v_mov_b32_e32 v15, v25
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
i32 2, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; fp4 x fp6 (cbsz = 4, blgp = 2) with the four trailing i32 operands of the
; call all constant 0. The checks expect the unscaled
; v_mfma_f32_32x32x64_f8f6f4 form, with no scale registers and no op_sel_hi.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[10:25], v[0:3], v[4:9], v[10:25] cbsz:4 blgp:2
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v10
; GCN-NEXT: v_mov_b32_e32 v1, v11
; GCN-NEXT: v_mov_b32_e32 v2, v12
; GCN-NEXT: v_mov_b32_e32 v3, v13
; GCN-NEXT: v_mov_b32_e32 v4, v14
; GCN-NEXT: v_mov_b32_e32 v5, v15
; GCN-NEXT: v_mov_b32_e32 v6, v16
; GCN-NEXT: v_mov_b32_e32 v7, v17
; GCN-NEXT: v_mov_b32_e32 v8, v18
; GCN-NEXT: v_mov_b32_e32 v9, v19
; GCN-NEXT: v_mov_b32_e32 v10, v20
; GCN-NEXT: v_mov_b32_e32 v11, v21
; GCN-NEXT: v_mov_b32_e32 v12, v22
; GCN-NEXT: v_mov_b32_e32 v13, v23
; GCN-NEXT: v_mov_b32_e32 v14, v24
; GCN-NEXT: v_mov_b32_e32 v15, v25
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
i32 2, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; fp4 x bf6
; Scaled MFMA with an fp4 A operand (cbsz = 4, <4 x i32>) and a bf6 B operand
; (blgp = 3, <6 x i32>). Both scales are plain i32 arguments; the checks expect
; them in v26 and v27 and the instruction to carry cbsz:4 blgp:3.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[10:25], v[0:3], v[4:9], v[10:25], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v10
; GCN-NEXT: v_mov_b32_e32 v1, v11
; GCN-NEXT: v_mov_b32_e32 v2, v12
; GCN-NEXT: v_mov_b32_e32 v3, v13
; GCN-NEXT: v_mov_b32_e32 v4, v14
; GCN-NEXT: v_mov_b32_e32 v5, v15
; GCN-NEXT: v_mov_b32_e32 v6, v16
; GCN-NEXT: v_mov_b32_e32 v7, v17
; GCN-NEXT: v_mov_b32_e32 v8, v18
; GCN-NEXT: v_mov_b32_e32 v9, v19
; GCN-NEXT: v_mov_b32_e32 v10, v20
; GCN-NEXT: v_mov_b32_e32 v11, v21
; GCN-NEXT: v_mov_b32_e32 v12, v22
; GCN-NEXT: v_mov_b32_e32 v13, v23
; GCN-NEXT: v_mov_b32_e32 v14, v24
; GCN-NEXT: v_mov_b32_e32 v15, v25
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
i32 3, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; fp4 x bf6 (cbsz = 4, blgp = 3) with the four trailing i32 operands of the
; call all constant 0. The checks expect the unscaled
; v_mfma_f32_32x32x64_f8f6f4 form, with no scale registers and no op_sel_hi.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[10:25], v[0:3], v[4:9], v[10:25] cbsz:4 blgp:3
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v10
; GCN-NEXT: v_mov_b32_e32 v1, v11
; GCN-NEXT: v_mov_b32_e32 v2, v12
; GCN-NEXT: v_mov_b32_e32 v3, v13
; GCN-NEXT: v_mov_b32_e32 v4, v14
; GCN-NEXT: v_mov_b32_e32 v5, v15
; GCN-NEXT: v_mov_b32_e32 v6, v16
; GCN-NEXT: v_mov_b32_e32 v7, v17
; GCN-NEXT: v_mov_b32_e32 v8, v18
; GCN-NEXT: v_mov_b32_e32 v9, v19
; GCN-NEXT: v_mov_b32_e32 v10, v20
; GCN-NEXT: v_mov_b32_e32 v11, v21
; GCN-NEXT: v_mov_b32_e32 v12, v22
; GCN-NEXT: v_mov_b32_e32 v13, v23
; GCN-NEXT: v_mov_b32_e32 v14, v24
; GCN-NEXT: v_mov_b32_e32 v15, v25
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
i32 3, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; fp4 x fp4
; Scaled MFMA with fp4 for both the A operand (cbsz = 4) and the B operand
; (blgp = 4), each passed as <4 x i32>. Both scales are plain i32 arguments;
; the checks expect them in v24 and v25 and the instruction to carry
; cbsz:4 blgp:4.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[0:3], v[4:7], v[8:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v8
; GCN-NEXT: v_mov_b32_e32 v1, v9
; GCN-NEXT: v_mov_b32_e32 v2, v10
; GCN-NEXT: v_mov_b32_e32 v3, v11
; GCN-NEXT: v_mov_b32_e32 v4, v12
; GCN-NEXT: v_mov_b32_e32 v5, v13
; GCN-NEXT: v_mov_b32_e32 v6, v14
; GCN-NEXT: v_mov_b32_e32 v7, v15
; GCN-NEXT: v_mov_b32_e32 v8, v16
; GCN-NEXT: v_mov_b32_e32 v9, v17
; GCN-NEXT: v_mov_b32_e32 v10, v18
; GCN-NEXT: v_mov_b32_e32 v11, v19
; GCN-NEXT: v_mov_b32_e32 v12, v20
; GCN-NEXT: v_mov_b32_e32 v13, v21
; GCN-NEXT: v_mov_b32_e32 v14, v22
; GCN-NEXT: v_mov_b32_e32 v15, v23
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
i32 4, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; fp4 x fp4 (cbsz = 4, blgp = 4) with the four trailing i32 operands of the
; call all constant 0. The checks expect the unscaled
; v_mfma_f32_32x32x64_f8f6f4 form, with no scale registers and no op_sel_hi.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:4 blgp:4
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v8
; GCN-NEXT: v_mov_b32_e32 v1, v9
; GCN-NEXT: v_mov_b32_e32 v2, v10
; GCN-NEXT: v_mov_b32_e32 v3, v11
; GCN-NEXT: v_mov_b32_e32 v4, v12
; GCN-NEXT: v_mov_b32_e32 v5, v13
; GCN-NEXT: v_mov_b32_e32 v6, v14
; GCN-NEXT: v_mov_b32_e32 v7, v15
; GCN-NEXT: v_mov_b32_e32 v8, v16
; GCN-NEXT: v_mov_b32_e32 v9, v17
; GCN-NEXT: v_mov_b32_e32 v10, v18
; GCN-NEXT: v_mov_b32_e32 v11, v19
; GCN-NEXT: v_mov_b32_e32 v12, v20
; GCN-NEXT: v_mov_b32_e32 v13, v21
; GCN-NEXT: v_mov_b32_e32 v14, v22
; GCN-NEXT: v_mov_b32_e32 v15, v23
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
i32 4, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; --------------------------------------------------------------------
; Different input parameter classes
; --------------------------------------------------------------------
; <8 x i32> A and B operands with both scale arguments marked inreg. The checks
; expect each scale to be copied from an SGPR (s0, s1) into a VGPR (v32, v33)
; before the MFMA, which takes them in that order.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: v_mov_b32_e32 v32, s0
; GCN-NEXT: v_mov_b32_e32 v33, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v16
; GCN-NEXT: v_mov_b32_e32 v1, v17
; GCN-NEXT: v_mov_b32_e32 v2, v18
; GCN-NEXT: v_mov_b32_e32 v3, v19
; GCN-NEXT: v_mov_b32_e32 v4, v20
; GCN-NEXT: v_mov_b32_e32 v5, v21
; GCN-NEXT: v_mov_b32_e32 v6, v22
; GCN-NEXT: v_mov_b32_e32 v7, v23
; GCN-NEXT: v_mov_b32_e32 v8, v24
; GCN-NEXT: v_mov_b32_e32 v9, v25
; GCN-NEXT: v_mov_b32_e32 v10, v26
; GCN-NEXT: v_mov_b32_e32 v11, v27
; GCN-NEXT: v_mov_b32_e32 v12, v28
; GCN-NEXT: v_mov_b32_e32 v13, v29
; GCN-NEXT: v_mov_b32_e32 v14, v30
; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; <8 x i32> A and B operands; %scale0 is inreg and %scale1 is a plain argument.
; The checks expect %scale0 to be copied from s0 into v33 and %scale1 to be
; loaded from scratch (offset:4) into v32; the MFMA takes them as v33, v32.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: scratch_load_dword v32, off, s32 offset:4
; GCN-NEXT: v_mov_b32_e32 v33, s0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v16
; GCN-NEXT: v_mov_b32_e32 v1, v17
; GCN-NEXT: v_mov_b32_e32 v2, v18
; GCN-NEXT: v_mov_b32_e32 v3, v19
; GCN-NEXT: v_mov_b32_e32 v4, v20
; GCN-NEXT: v_mov_b32_e32 v5, v21
; GCN-NEXT: v_mov_b32_e32 v6, v22
; GCN-NEXT: v_mov_b32_e32 v7, v23
; GCN-NEXT: v_mov_b32_e32 v8, v24
; GCN-NEXT: v_mov_b32_e32 v9, v25
; GCN-NEXT: v_mov_b32_e32 v10, v26
; GCN-NEXT: v_mov_b32_e32 v11, v27
; GCN-NEXT: v_mov_b32_e32 v12, v28
; GCN-NEXT: v_mov_b32_e32 v13, v29
; GCN-NEXT: v_mov_b32_e32 v14, v30
; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; <8 x i32> A and B operands; %scale0 is a plain argument and %scale1 is inreg.
; The checks expect %scale0 to be loaded from scratch (offset:4) into v32 and
; %scale1 to be copied from s0 into v33; the MFMA takes them as v32, v33.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: scratch_load_dword v32, off, s32 offset:4
; GCN-NEXT: v_mov_b32_e32 v33, s0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v16
; GCN-NEXT: v_mov_b32_e32 v1, v17
; GCN-NEXT: v_mov_b32_e32 v2, v18
; GCN-NEXT: v_mov_b32_e32 v3, v19
; GCN-NEXT: v_mov_b32_e32 v4, v20
; GCN-NEXT: v_mov_b32_e32 v5, v21
; GCN-NEXT: v_mov_b32_e32 v6, v22
; GCN-NEXT: v_mov_b32_e32 v7, v23
; GCN-NEXT: v_mov_b32_e32 v8, v24
; GCN-NEXT: v_mov_b32_e32 v9, v25
; GCN-NEXT: v_mov_b32_e32 v10, v26
; GCN-NEXT: v_mov_b32_e32 v11, v27
; GCN-NEXT: v_mov_b32_e32 v12, v28
; GCN-NEXT: v_mov_b32_e32 v13, v29
; GCN-NEXT: v_mov_b32_e32 v14, v30
; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; All three matrix operands (%arg0, %arg1, %arg2) are inreg, while both scales
; are plain arguments. The SelectionDAG and GlobalISel outputs differ here, so
; each selector has its own check block. In both, the matrix operands are moved
; into VGPRs before the MFMA. The GlobalISel output additionally saves and
; restores s36-s39 and s48-s51 through lanes of v34, which is itself spilled to
; and reloaded from scratch.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v26, s0
; SDAG-NEXT: v_mov_b32_e32 v27, s1
; SDAG-NEXT: v_mov_b32_e32 v28, s2
; SDAG-NEXT: v_mov_b32_e32 v29, s3
; SDAG-NEXT: v_mov_b32_e32 v30, s16
; SDAG-NEXT: v_mov_b32_e32 v31, s17
; SDAG-NEXT: v_mov_b32_e32 v32, s18
; SDAG-NEXT: v_mov_b32_e32 v33, s19
; SDAG-NEXT: v_readfirstlane_b32 s4, v13
; SDAG-NEXT: v_readfirstlane_b32 s5, v12
; SDAG-NEXT: v_readfirstlane_b32 s6, v11
; SDAG-NEXT: v_readfirstlane_b32 s7, v10
; SDAG-NEXT: v_readfirstlane_b32 s8, v9
; SDAG-NEXT: v_readfirstlane_b32 s9, v8
; SDAG-NEXT: v_readfirstlane_b32 s10, v7
; SDAG-NEXT: v_readfirstlane_b32 s11, v6
; SDAG-NEXT: v_readfirstlane_b32 s12, v5
; SDAG-NEXT: v_readfirstlane_b32 s13, v4
; SDAG-NEXT: v_readfirstlane_b32 s14, v3
; SDAG-NEXT: v_readfirstlane_b32 s15, v2
; SDAG-NEXT: v_readfirstlane_b32 s40, v1
; SDAG-NEXT: v_readfirstlane_b32 s41, v0
; SDAG-NEXT: v_mov_b32_e32 v16, v15
; SDAG-NEXT: v_mov_b32_e32 v17, v14
; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: v_mov_b32_e32 v1, s29
; SDAG-NEXT: v_mov_b32_e32 v2, s41
; SDAG-NEXT: v_mov_b32_e32 v3, s40
; SDAG-NEXT: v_mov_b32_e32 v4, s15
; SDAG-NEXT: v_mov_b32_e32 v5, s14
; SDAG-NEXT: v_mov_b32_e32 v6, s13
; SDAG-NEXT: v_mov_b32_e32 v7, s12
; SDAG-NEXT: v_mov_b32_e32 v8, s11
; SDAG-NEXT: v_mov_b32_e32 v9, s10
; SDAG-NEXT: v_mov_b32_e32 v10, s9
; SDAG-NEXT: v_mov_b32_e32 v11, s8
; SDAG-NEXT: v_mov_b32_e32 v12, s7
; SDAG-NEXT: v_mov_b32_e32 v13, s6
; SDAG-NEXT: v_mov_b32_e32 v14, s5
; SDAG-NEXT: v_mov_b32_e32 v15, s4
; SDAG-NEXT: v_mov_b32_e32 v18, s20
; SDAG-NEXT: v_mov_b32_e32 v19, s21
; SDAG-NEXT: v_mov_b32_e32 v20, s22
; SDAG-NEXT: v_mov_b32_e32 v21, s23
; SDAG-NEXT: v_mov_b32_e32 v22, s24
; SDAG-NEXT: v_mov_b32_e32 v23, s25
; SDAG-NEXT: v_mov_b32_e32 v24, s26
; SDAG-NEXT: v_mov_b32_e32 v25, s27
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[26:33], v[18:25], v[0:15], v17, v16 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GISEL-NEXT: scratch_store_dword off, v34, s32 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: v_writelane_b32 v34, s36, 0
; GISEL-NEXT: v_writelane_b32 v34, s37, 1
; GISEL-NEXT: v_writelane_b32 v34, s38, 2
; GISEL-NEXT: v_writelane_b32 v34, s39, 3
; GISEL-NEXT: s_mov_b32 s12, s0
; GISEL-NEXT: s_mov_b32 s13, s1
; GISEL-NEXT: s_mov_b32 s14, s2
; GISEL-NEXT: s_mov_b32 s15, s3
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
; GISEL-NEXT: v_writelane_b32 v34, s48, 4
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[12:13]
; GISEL-NEXT: v_writelane_b32 v34, s49, 5
; GISEL-NEXT: v_writelane_b32 v34, s50, 6
; GISEL-NEXT: s_mov_b32 s36, s28
; GISEL-NEXT: s_mov_b32 s37, s29
; GISEL-NEXT: v_writelane_b32 v34, s51, 7
; GISEL-NEXT: v_mov_b32_e32 v16, v14
; GISEL-NEXT: v_mov_b32_e32 v17, v15
; GISEL-NEXT: v_readfirstlane_b32 s38, v0
; GISEL-NEXT: v_readfirstlane_b32 s39, v1
; GISEL-NEXT: v_readfirstlane_b32 s40, v2
; GISEL-NEXT: v_readfirstlane_b32 s41, v3
; GISEL-NEXT: v_readfirstlane_b32 s42, v4
; GISEL-NEXT: v_readfirstlane_b32 s43, v5
; GISEL-NEXT: v_readfirstlane_b32 s44, v6
; GISEL-NEXT: v_readfirstlane_b32 s45, v7
; GISEL-NEXT: v_readfirstlane_b32 s46, v8
; GISEL-NEXT: v_readfirstlane_b32 s47, v9
; GISEL-NEXT: v_readfirstlane_b32 s48, v10
; GISEL-NEXT: v_readfirstlane_b32 s49, v11
; GISEL-NEXT: v_readfirstlane_b32 s50, v12
; GISEL-NEXT: v_readfirstlane_b32 s51, v13
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[26:27]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; GISEL-NEXT: v_readlane_b32 s51, v34, 7
; GISEL-NEXT: v_readlane_b32 s50, v34, 6
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[18:25], v[26:33], v[0:15], v16, v17 op_sel_hi:[0,0,0]
; GISEL-NEXT: v_readlane_b32 s49, v34, 5
; GISEL-NEXT: v_readlane_b32 s48, v34, 4
; GISEL-NEXT: v_readlane_b32 s39, v34, 3
; GISEL-NEXT: v_readlane_b32 s38, v34, 2
; GISEL-NEXT: v_readlane_b32 s37, v34, 1
; GISEL-NEXT: v_readlane_b32 s36, v34, 0
; GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GISEL-NEXT: scratch_load_dword v34, off, s32 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[0:1]
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; Mixed argument placement for the _0_0 (both format immediates are 0) scaled
; MFMA: %arg0 and %scale0 are marked inreg, the remaining arguments are not.
; The expected code for both instruction selectors copies %arg0 out of
; s0-s3/s16-s19 into v[26:33] and %scale0 out of s20 into v25, issues the MFMA
; with v[8:23] as accumulator and destination, then moves the result down to
; v[0:15] after the s_nop padding.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v26, s0
; SDAG-NEXT: v_mov_b32_e32 v27, s1
; SDAG-NEXT: v_mov_b32_e32 v28, s2
; SDAG-NEXT: v_mov_b32_e32 v29, s3
; SDAG-NEXT: v_mov_b32_e32 v30, s16
; SDAG-NEXT: v_mov_b32_e32 v31, s17
; SDAG-NEXT: v_mov_b32_e32 v32, s18
; SDAG-NEXT: v_mov_b32_e32 v33, s19
; SDAG-NEXT: v_mov_b32_e32 v25, s20
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[26:33], v[0:7], v[8:23], v25, v24 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v8
; SDAG-NEXT: v_mov_b32_e32 v1, v9
; SDAG-NEXT: v_mov_b32_e32 v2, v10
; SDAG-NEXT: v_mov_b32_e32 v3, v11
; SDAG-NEXT: v_mov_b32_e32 v4, v12
; SDAG-NEXT: v_mov_b32_e32 v5, v13
; SDAG-NEXT: v_mov_b32_e32 v6, v14
; SDAG-NEXT: v_mov_b32_e32 v7, v15
; SDAG-NEXT: v_mov_b32_e32 v8, v16
; SDAG-NEXT: v_mov_b32_e32 v9, v17
; SDAG-NEXT: v_mov_b32_e32 v10, v18
; SDAG-NEXT: v_mov_b32_e32 v11, v19
; SDAG-NEXT: v_mov_b32_e32 v12, v20
; SDAG-NEXT: v_mov_b32_e32 v13, v21
; SDAG-NEXT: v_mov_b32_e32 v14, v22
; SDAG-NEXT: v_mov_b32_e32 v15, v23
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s12, s0
; GISEL-NEXT: s_mov_b32 s13, s1
; GISEL-NEXT: s_mov_b32 s14, s2
; GISEL-NEXT: s_mov_b32 s15, s3
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13]
; GISEL-NEXT: v_mov_b32_e32 v25, s20
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[26:33], v[0:7], v[8:23], v25, v24 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v8
; GISEL-NEXT: v_mov_b32_e32 v1, v9
; GISEL-NEXT: v_mov_b32_e32 v2, v10
; GISEL-NEXT: v_mov_b32_e32 v3, v11
; GISEL-NEXT: v_mov_b32_e32 v4, v12
; GISEL-NEXT: v_mov_b32_e32 v5, v13
; GISEL-NEXT: v_mov_b32_e32 v6, v14
; GISEL-NEXT: v_mov_b32_e32 v7, v15
; GISEL-NEXT: v_mov_b32_e32 v8, v16
; GISEL-NEXT: v_mov_b32_e32 v9, v17
; GISEL-NEXT: v_mov_b32_e32 v10, v18
; GISEL-NEXT: v_mov_b32_e32 v11, v19
; GISEL-NEXT: v_mov_b32_e32 v12, v20
; GISEL-NEXT: v_mov_b32_e32 v13, v21
; GISEL-NEXT: v_mov_b32_e32 v14, v22
; GISEL-NEXT: v_mov_b32_e32 v15, v23
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; Mixed argument placement for the _0_0 scaled MFMA: %arg0 and %scale1 are
; marked inreg, the remaining arguments are not.
; The expected code copies %arg0 out of s0-s3/s16-s19 into v[26:33] and
; %scale1 out of s20 into v25; the MFMA therefore takes its scale operands as
; (v24, v25), i.e. %scale0 stays in its incoming VGPR and only %scale1 needed a
; copy. The v[8:23] result is moved down to v[0:15] afterwards.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v26, s0
; SDAG-NEXT: v_mov_b32_e32 v27, s1
; SDAG-NEXT: v_mov_b32_e32 v28, s2
; SDAG-NEXT: v_mov_b32_e32 v29, s3
; SDAG-NEXT: v_mov_b32_e32 v30, s16
; SDAG-NEXT: v_mov_b32_e32 v31, s17
; SDAG-NEXT: v_mov_b32_e32 v32, s18
; SDAG-NEXT: v_mov_b32_e32 v33, s19
; SDAG-NEXT: v_mov_b32_e32 v25, s20
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[26:33], v[0:7], v[8:23], v24, v25 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v8
; SDAG-NEXT: v_mov_b32_e32 v1, v9
; SDAG-NEXT: v_mov_b32_e32 v2, v10
; SDAG-NEXT: v_mov_b32_e32 v3, v11
; SDAG-NEXT: v_mov_b32_e32 v4, v12
; SDAG-NEXT: v_mov_b32_e32 v5, v13
; SDAG-NEXT: v_mov_b32_e32 v6, v14
; SDAG-NEXT: v_mov_b32_e32 v7, v15
; SDAG-NEXT: v_mov_b32_e32 v8, v16
; SDAG-NEXT: v_mov_b32_e32 v9, v17
; SDAG-NEXT: v_mov_b32_e32 v10, v18
; SDAG-NEXT: v_mov_b32_e32 v11, v19
; SDAG-NEXT: v_mov_b32_e32 v12, v20
; SDAG-NEXT: v_mov_b32_e32 v13, v21
; SDAG-NEXT: v_mov_b32_e32 v14, v22
; SDAG-NEXT: v_mov_b32_e32 v15, v23
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s12, s0
; GISEL-NEXT: s_mov_b32 s13, s1
; GISEL-NEXT: s_mov_b32 s14, s2
; GISEL-NEXT: s_mov_b32 s15, s3
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13]
; GISEL-NEXT: v_mov_b32_e32 v25, s20
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[26:33], v[0:7], v[8:23], v24, v25 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v8
; GISEL-NEXT: v_mov_b32_e32 v1, v9
; GISEL-NEXT: v_mov_b32_e32 v2, v10
; GISEL-NEXT: v_mov_b32_e32 v3, v11
; GISEL-NEXT: v_mov_b32_e32 v4, v12
; GISEL-NEXT: v_mov_b32_e32 v5, v13
; GISEL-NEXT: v_mov_b32_e32 v6, v14
; GISEL-NEXT: v_mov_b32_e32 v7, v15
; GISEL-NEXT: v_mov_b32_e32 v8, v16
; GISEL-NEXT: v_mov_b32_e32 v9, v17
; GISEL-NEXT: v_mov_b32_e32 v10, v18
; GISEL-NEXT: v_mov_b32_e32 v11, v19
; GISEL-NEXT: v_mov_b32_e32 v12, v20
; GISEL-NEXT: v_mov_b32_e32 v13, v21
; GISEL-NEXT: v_mov_b32_e32 v14, v22
; GISEL-NEXT: v_mov_b32_e32 v15, v23
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; Mixed argument placement for the _0_0 scaled MFMA: %arg1 and %scale1 are
; marked inreg, the remaining arguments are not.
; The expected code copies %arg1 out of s0-s3/s16-s19 into v[26:33] and
; %scale1 out of s20 into v25. In the MFMA, v[0:7] (%arg0, untouched) is the
; first matrix operand and the copied v[26:33] is the second; the v[8:23]
; result is moved down to v[0:15] afterwards.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> inreg %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v26, s0
; SDAG-NEXT: v_mov_b32_e32 v27, s1
; SDAG-NEXT: v_mov_b32_e32 v28, s2
; SDAG-NEXT: v_mov_b32_e32 v29, s3
; SDAG-NEXT: v_mov_b32_e32 v30, s16
; SDAG-NEXT: v_mov_b32_e32 v31, s17
; SDAG-NEXT: v_mov_b32_e32 v32, s18
; SDAG-NEXT: v_mov_b32_e32 v33, s19
; SDAG-NEXT: v_mov_b32_e32 v25, s20
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[0:7], v[26:33], v[8:23], v24, v25 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v8
; SDAG-NEXT: v_mov_b32_e32 v1, v9
; SDAG-NEXT: v_mov_b32_e32 v2, v10
; SDAG-NEXT: v_mov_b32_e32 v3, v11
; SDAG-NEXT: v_mov_b32_e32 v4, v12
; SDAG-NEXT: v_mov_b32_e32 v5, v13
; SDAG-NEXT: v_mov_b32_e32 v6, v14
; SDAG-NEXT: v_mov_b32_e32 v7, v15
; SDAG-NEXT: v_mov_b32_e32 v8, v16
; SDAG-NEXT: v_mov_b32_e32 v9, v17
; SDAG-NEXT: v_mov_b32_e32 v10, v18
; SDAG-NEXT: v_mov_b32_e32 v11, v19
; SDAG-NEXT: v_mov_b32_e32 v12, v20
; SDAG-NEXT: v_mov_b32_e32 v13, v21
; SDAG-NEXT: v_mov_b32_e32 v14, v22
; SDAG-NEXT: v_mov_b32_e32 v15, v23
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s12, s0
; GISEL-NEXT: s_mov_b32 s13, s1
; GISEL-NEXT: s_mov_b32 s14, s2
; GISEL-NEXT: s_mov_b32 s15, s3
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13]
; GISEL-NEXT: v_mov_b32_e32 v25, s20
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[0:7], v[26:33], v[8:23], v24, v25 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v8
; GISEL-NEXT: v_mov_b32_e32 v1, v9
; GISEL-NEXT: v_mov_b32_e32 v2, v10
; GISEL-NEXT: v_mov_b32_e32 v3, v11
; GISEL-NEXT: v_mov_b32_e32 v4, v12
; GISEL-NEXT: v_mov_b32_e32 v5, v13
; GISEL-NEXT: v_mov_b32_e32 v6, v14
; GISEL-NEXT: v_mov_b32_e32 v7, v15
; GISEL-NEXT: v_mov_b32_e32 v8, v16
; GISEL-NEXT: v_mov_b32_e32 v9, v17
; GISEL-NEXT: v_mov_b32_e32 v10, v18
; GISEL-NEXT: v_mov_b32_e32 v11, v19
; GISEL-NEXT: v_mov_b32_e32 v12, v20
; GISEL-NEXT: v_mov_b32_e32 v13, v21
; GISEL-NEXT: v_mov_b32_e32 v14, v22
; GISEL-NEXT: v_mov_b32_e32 v15, v23
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; Mixed argument placement for the _0_0 scaled MFMA: the accumulator %arg2 and
; %scale1 are marked inreg, while %arg0, %arg1 and %scale0 are not.
; The expected code first moves %arg0/%arg1 out of v[0:15] to free those
; registers, then materializes the accumulator into v[0:15] from
; s0-s3/s16-s27 and %scale1 into v17 from s28. Because the MFMA writes v[0:15]
; in place, no result copies follow it.
; The two instruction selectors differ only in which of v[18:25] / v[26:33]
; receives %arg0 versus %arg1 (and in using 32-bit vs 64-bit moves).
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v33, v7
; SDAG-NEXT: v_mov_b32_e32 v32, v6
; SDAG-NEXT: v_mov_b32_e32 v31, v5
; SDAG-NEXT: v_mov_b32_e32 v30, v4
; SDAG-NEXT: v_mov_b32_e32 v29, v3
; SDAG-NEXT: v_mov_b32_e32 v28, v2
; SDAG-NEXT: v_mov_b32_e32 v27, v1
; SDAG-NEXT: v_mov_b32_e32 v26, v0
; SDAG-NEXT: v_mov_b32_e32 v25, v15
; SDAG-NEXT: v_mov_b32_e32 v24, v14
; SDAG-NEXT: v_mov_b32_e32 v23, v13
; SDAG-NEXT: v_mov_b32_e32 v22, v12
; SDAG-NEXT: v_mov_b32_e32 v21, v11
; SDAG-NEXT: v_mov_b32_e32 v20, v10
; SDAG-NEXT: v_mov_b32_e32 v19, v9
; SDAG-NEXT: v_mov_b32_e32 v18, v8
; SDAG-NEXT: v_mov_b32_e32 v0, s0
; SDAG-NEXT: v_mov_b32_e32 v1, s1
; SDAG-NEXT: v_mov_b32_e32 v2, s2
; SDAG-NEXT: v_mov_b32_e32 v3, s3
; SDAG-NEXT: v_mov_b32_e32 v4, s16
; SDAG-NEXT: v_mov_b32_e32 v5, s17
; SDAG-NEXT: v_mov_b32_e32 v6, s18
; SDAG-NEXT: v_mov_b32_e32 v7, s19
; SDAG-NEXT: v_mov_b32_e32 v8, s20
; SDAG-NEXT: v_mov_b32_e32 v9, s21
; SDAG-NEXT: v_mov_b32_e32 v10, s22
; SDAG-NEXT: v_mov_b32_e32 v11, s23
; SDAG-NEXT: v_mov_b32_e32 v12, s24
; SDAG-NEXT: v_mov_b32_e32 v13, s25
; SDAG-NEXT: v_mov_b32_e32 v14, s26
; SDAG-NEXT: v_mov_b32_e32 v15, s27
; SDAG-NEXT: v_mov_b32_e32 v17, s28
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[26:33], v[18:25], v[0:15], v16, v17 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v18, v0
; GISEL-NEXT: v_mov_b32_e32 v19, v1
; GISEL-NEXT: v_mov_b32_e32 v20, v2
; GISEL-NEXT: v_mov_b32_e32 v21, v3
; GISEL-NEXT: v_mov_b32_e32 v22, v4
; GISEL-NEXT: v_mov_b32_e32 v23, v5
; GISEL-NEXT: v_mov_b32_e32 v24, v6
; GISEL-NEXT: v_mov_b32_e32 v25, v7
; GISEL-NEXT: s_mov_b32 s12, s0
; GISEL-NEXT: s_mov_b32 s13, s1
; GISEL-NEXT: v_mov_b32_e32 v26, v8
; GISEL-NEXT: v_mov_b32_e32 v27, v9
; GISEL-NEXT: v_mov_b32_e32 v28, v10
; GISEL-NEXT: v_mov_b32_e32 v29, v11
; GISEL-NEXT: v_mov_b32_e32 v30, v12
; GISEL-NEXT: v_mov_b32_e32 v31, v13
; GISEL-NEXT: v_mov_b32_e32 v32, v14
; GISEL-NEXT: v_mov_b32_e32 v33, v15
; GISEL-NEXT: s_mov_b32 s14, s2
; GISEL-NEXT: s_mov_b32 s15, s3
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[26:27]
; GISEL-NEXT: v_mov_b32_e32 v17, s28
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[18:25], v[26:33], v[0:15], v16, v17 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; Mixed argument placement for the _0_0 scaled MFMA: %arg0, the accumulator
; %arg2 and %scale1 are all marked inreg; %arg1 and %scale0 are not.
; In the expected code %arg0 comes from s0-s3/s16-s19 and the first ten
; accumulator elements from s20-s29. The last six accumulator elements and
; %scale1 are instead read from v8-v13 and v15 with v_readfirstlane_b32
; (presumably because the inreg arguments exceed the SGPRs available for
; argument passing -- confirm against the calling convention), while %scale0
; is taken from v14.
; The GlobalISel output additionally saves and restores s36-s39 and s48-s51
; through lanes of v34, which is itself spilled to and reloaded from scratch.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v26, s0
; SDAG-NEXT: v_mov_b32_e32 v27, s1
; SDAG-NEXT: v_mov_b32_e32 v28, s2
; SDAG-NEXT: v_mov_b32_e32 v29, s3
; SDAG-NEXT: v_mov_b32_e32 v30, s16
; SDAG-NEXT: v_mov_b32_e32 v31, s17
; SDAG-NEXT: v_mov_b32_e32 v32, s18
; SDAG-NEXT: v_mov_b32_e32 v33, s19
; SDAG-NEXT: v_mov_b32_e32 v16, v15
; SDAG-NEXT: v_readfirstlane_b32 s4, v13
; SDAG-NEXT: v_readfirstlane_b32 s5, v12
; SDAG-NEXT: v_readfirstlane_b32 s6, v11
; SDAG-NEXT: v_readfirstlane_b32 s7, v10
; SDAG-NEXT: v_readfirstlane_b32 s8, v9
; SDAG-NEXT: v_readfirstlane_b32 s9, v8
; SDAG-NEXT: v_readfirstlane_b32 s0, v16
; SDAG-NEXT: v_mov_b32_e32 v17, v14
; SDAG-NEXT: v_mov_b32_e32 v25, v7
; SDAG-NEXT: v_mov_b32_e32 v24, v6
; SDAG-NEXT: v_mov_b32_e32 v23, v5
; SDAG-NEXT: v_mov_b32_e32 v22, v4
; SDAG-NEXT: v_mov_b32_e32 v21, v3
; SDAG-NEXT: v_mov_b32_e32 v20, v2
; SDAG-NEXT: v_mov_b32_e32 v19, v1
; SDAG-NEXT: v_mov_b32_e32 v18, v0
; SDAG-NEXT: v_mov_b32_e32 v0, s20
; SDAG-NEXT: v_mov_b32_e32 v1, s21
; SDAG-NEXT: v_mov_b32_e32 v2, s22
; SDAG-NEXT: v_mov_b32_e32 v3, s23
; SDAG-NEXT: v_mov_b32_e32 v4, s24
; SDAG-NEXT: v_mov_b32_e32 v5, s25
; SDAG-NEXT: v_mov_b32_e32 v6, s26
; SDAG-NEXT: v_mov_b32_e32 v7, s27
; SDAG-NEXT: v_mov_b32_e32 v8, s28
; SDAG-NEXT: v_mov_b32_e32 v9, s29
; SDAG-NEXT: v_mov_b32_e32 v10, s9
; SDAG-NEXT: v_mov_b32_e32 v11, s8
; SDAG-NEXT: v_mov_b32_e32 v12, s7
; SDAG-NEXT: v_mov_b32_e32 v13, s6
; SDAG-NEXT: v_mov_b32_e32 v14, s5
; SDAG-NEXT: v_mov_b32_e32 v15, s4
; SDAG-NEXT: v_mov_b32_e32 v16, s0
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[26:33], v[18:25], v[0:15], v17, v16 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GISEL-NEXT: scratch_store_dword off, v34, s32 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: v_writelane_b32 v34, s36, 0
; GISEL-NEXT: v_writelane_b32 v34, s37, 1
; GISEL-NEXT: v_writelane_b32 v34, s38, 2
; GISEL-NEXT: v_writelane_b32 v34, s39, 3
; GISEL-NEXT: s_mov_b32 s12, s0
; GISEL-NEXT: s_mov_b32 s13, s1
; GISEL-NEXT: s_mov_b32 s14, s2
; GISEL-NEXT: s_mov_b32 s15, s3
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19]
; GISEL-NEXT: v_writelane_b32 v34, s48, 4
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13]
; GISEL-NEXT: v_writelane_b32 v34, s49, 5
; GISEL-NEXT: v_writelane_b32 v34, s50, 6
; GISEL-NEXT: s_mov_b32 s36, s20
; GISEL-NEXT: s_mov_b32 s37, s21
; GISEL-NEXT: v_writelane_b32 v34, s51, 7
; GISEL-NEXT: v_mov_b32_e32 v18, v0
; GISEL-NEXT: v_mov_b32_e32 v19, v1
; GISEL-NEXT: v_mov_b32_e32 v20, v2
; GISEL-NEXT: v_mov_b32_e32 v21, v3
; GISEL-NEXT: v_mov_b32_e32 v22, v4
; GISEL-NEXT: v_mov_b32_e32 v23, v5
; GISEL-NEXT: v_mov_b32_e32 v24, v6
; GISEL-NEXT: v_mov_b32_e32 v25, v7
; GISEL-NEXT: s_mov_b32 s38, s22
; GISEL-NEXT: s_mov_b32 s39, s23
; GISEL-NEXT: s_mov_b32 s40, s24
; GISEL-NEXT: s_mov_b32 s41, s25
; GISEL-NEXT: s_mov_b32 s42, s26
; GISEL-NEXT: s_mov_b32 s43, s27
; GISEL-NEXT: s_mov_b32 s44, s28
; GISEL-NEXT: s_mov_b32 s45, s29
; GISEL-NEXT: v_mov_b32_e32 v16, v14
; GISEL-NEXT: v_readfirstlane_b32 s46, v8
; GISEL-NEXT: v_readfirstlane_b32 s47, v9
; GISEL-NEXT: v_readfirstlane_b32 s48, v10
; GISEL-NEXT: v_readfirstlane_b32 s49, v11
; GISEL-NEXT: v_readfirstlane_b32 s50, v12
; GISEL-NEXT: v_readfirstlane_b32 s51, v13
; GISEL-NEXT: v_readfirstlane_b32 s0, v15
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; GISEL-NEXT: v_mov_b32_e32 v17, s0
; GISEL-NEXT: v_readlane_b32 s51, v34, 7
; GISEL-NEXT: v_readlane_b32 s50, v34, 6
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[26:33], v[18:25], v[0:15], v16, v17 op_sel_hi:[0,0,0]
; GISEL-NEXT: v_readlane_b32 s49, v34, 5
; GISEL-NEXT: v_readlane_b32 s48, v34, 4
; GISEL-NEXT: v_readlane_b32 s39, v34, 3
; GISEL-NEXT: v_readlane_b32 s38, v34, 2
; GISEL-NEXT: v_readlane_b32 s37, v34, 1
; GISEL-NEXT: v_readlane_b32 s36, v34, 0
; GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
; GISEL-NEXT: scratch_load_dword v34, off, s32 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[0:1]
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; Constant scale operands: the intrinsic is called with 33 and -2 as the two
; scale values, so the %scale0/%scale1 parameters are unused.
; Each scale constant is preceded by an immediate 2 (the register-scale tests
; in this file pass 0 there), and the expected MFMA carries op_sel_hi:[1,1,0]
; instead of [0,0,0].
; In the expected code both constants are materialized into v32/v33 with
; v_mov_b32 before the MFMA rather than appearing as MFMA operands directly;
; v31 (the last accumulator element) is loaded from scratch at s32 first.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: v_mov_b32_e32 v32, -2
; SDAG-NEXT: v_mov_b32_e32 v33, 33
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: v_mov_b32_e32 v32, 33
; GISEL-NEXT: v_mov_b32_e32 v33, -2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2)
ret <16 x float> %result
}
; Constant scale operands: the intrinsic is called with 65 and -2 as the two
; scale values, so the %scale0/%scale1 parameters are unused.
; In the expected code 65 is materialized as the literal 0x41 and -2 as -2,
; each via v_mov_b32 into v32/v33 (the two instruction selectors assign the
; two registers in opposite order). As in the other constant-scale tests the
; immediate before each scale is 2 and the MFMA carries op_sel_hi:[1,1,0].
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: v_mov_b32_e32 v32, -2
; SDAG-NEXT: v_mov_b32_e32 v33, 0x41
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: v_mov_b32_e32 v32, 0x41
; GISEL-NEXT: v_mov_b32_e32 v33, -2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2)
ret <16 x float> %result
}
; Constant scale operands: the intrinsic is called with 65 and 1065353216 as
; the two scale values, so the %scale0/%scale1 parameters are unused.
; 1065353216 is 0x3F800000, which the expected code prints as the
; floating-point constant 1.0; 65 is printed as the literal 0x41. Both are
; materialized into v32/v33 with v_mov_b32 before the MFMA, which carries
; op_sel_hi:[1,1,0].
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: v_mov_b32_e32 v32, 1.0
; SDAG-NEXT: v_mov_b32_e32 v33, 0x41
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: v_mov_b32_e32 v32, 0x41
; GISEL-NEXT: v_mov_b32_e32 v33, 1.0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 1065353216)
ret <16 x float> %result
}
; Constant scale operands: the intrinsic is called with 1065353216 and -2 as
; the two scale values, so the %scale0/%scale1 parameters are unused.
; 1065353216 is 0x3F800000, which the expected code prints as the
; floating-point constant 1.0. Both constants are materialized into v32/v33
; with v_mov_b32 before the MFMA, which carries op_sel_hi:[1,1,0].
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: v_mov_b32_e32 v32, -2
; SDAG-NEXT: v_mov_b32_e32 v33, 1.0
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: v_mov_b32_e32 v32, 1.0
; GISEL-NEXT: v_mov_b32_e32 v33, -2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1065353216, i32 2, i32 -2)
ret <16 x float> %result
}
; Constant scale operands: the intrinsic is called with 1042479491 and
; 1065353216 as the two scale values, so the %scale0/%scale1 parameters are
; unused.
; The expected code prints both bit patterns as floating-point constants:
; 1042479491 as 0.15915494 and 1065353216 (0x3F800000) as 1.0. Both are
; materialized into v32/v33 with v_mov_b32 before the MFMA, which carries
; op_sel_hi:[1,1,0].
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: v_mov_b32_e32 v32, 1.0
; SDAG-NEXT: v_mov_b32_e32 v33, 0.15915494
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: v_mov_b32_e32 v32, 0.15915494
; GISEL-NEXT: v_mov_b32_e32 v33, 1.0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1042479491, i32 2, i32 1065353216)
ret <16 x float> %result
}
; Both scale operands are integer constants: 65 (0x41) for the first scale and
; 77 (0x4d) for the second. The expected code for both selectors materializes
; each constant in a VGPR with v_mov_b32 and passes those VGPRs to the MFMA.
; The %scale0 and %scale1 parameters are not referenced by the body.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: v_mov_b32_e32 v32, 0x4d
; SDAG-NEXT: v_mov_b32_e32 v33, 0x41
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: v_mov_b32_e32 v32, 0x41
; GISEL-NEXT: v_mov_b32_e32 v33, 0x4d
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77)
ret <16 x float> %result
}
; Kernel variant: the two scales come from the kernel arguments %scale0 and
; %scale1, and the MFMA result is stored to %ptr. The immediates in the call
; (i32 0, i32 2, i32 3, <scale>, i32 1, <scale>) appear in the expected MFMA as
; blgp:2 together with op_sel:[1,1,0] op_sel_hi:[1,0,0].
; NOTE(review): attribute group #0 is defined outside this excerpt; the
; "vgprcd" part of the name presumably refers to it - confirm against the
; attribute list at the end of the file.
define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
; SDAG-NEXT: v_mov_b32_e32 v16, s8
; SDAG-NEXT: v_mov_b32_e32 v17, s9
; SDAG-NEXT: v_mov_b32_e32 v18, s10
; SDAG-NEXT: v_mov_b32_e32 v19, s11
; SDAG-NEXT: v_mov_b32_e32 v20, s12
; SDAG-NEXT: v_mov_b32_e32 v21, s13
; SDAG-NEXT: v_mov_b32_e32 v22, s14
; SDAG-NEXT: v_mov_b32_e32 v23, s15
; SDAG-NEXT: v_mov_b32_e32 v24, s16
; SDAG-NEXT: v_mov_b32_e32 v25, s17
; SDAG-NEXT: v_mov_b32_e32 v26, s18
; SDAG-NEXT: v_mov_b32_e32 v27, s19
; SDAG-NEXT: v_mov_b32_e32 v28, s20
; SDAG-NEXT: v_mov_b32_e32 v29, s21
; SDAG-NEXT: v_mov_b32_e32 v30, s22
; SDAG-NEXT: v_mov_b32_e32 v31, s23
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; SDAG-NEXT: v_mov_b32_e32 v32, s0
; SDAG-NEXT: v_mov_b32_e32 v33, s1
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 2
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; GISEL-NEXT: v_mov_b32_e32 v32, s0
; GISEL-NEXT: v_mov_b32_e32 v33, s1
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
store <16 x float> %result, ptr addrspace(1) %ptr, align 64
ret void
}
; Kernel variant with constant scales: 65 (0x41) for the first scale and -2 for
; the second. The remaining immediates match the __vgprcd test (expected
; blgp:2, op_sel:[1,1,0] op_sel_hi:[1,0,0]). The expected code for both
; selectors moves each constant into a VGPR before the MFMA, and the result is
; stored to %ptr.
; NOTE(review): attribute group #0 is defined outside this excerpt.
define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, ptr addrspace(1) %ptr) #0 {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
; SDAG-NEXT: v_mov_b32_e32 v32, -2
; SDAG-NEXT: v_mov_b32_e32 v33, 0x41
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v16, s8
; SDAG-NEXT: v_mov_b32_e32 v17, s9
; SDAG-NEXT: v_mov_b32_e32 v18, s10
; SDAG-NEXT: v_mov_b32_e32 v19, s11
; SDAG-NEXT: v_mov_b32_e32 v20, s12
; SDAG-NEXT: v_mov_b32_e32 v21, s13
; SDAG-NEXT: v_mov_b32_e32 v22, s14
; SDAG-NEXT: v_mov_b32_e32 v23, s15
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
; SDAG-NEXT: v_mov_b32_e32 v24, s16
; SDAG-NEXT: v_mov_b32_e32 v25, s17
; SDAG-NEXT: v_mov_b32_e32 v26, s18
; SDAG-NEXT: v_mov_b32_e32 v27, s19
; SDAG-NEXT: v_mov_b32_e32 v28, s20
; SDAG-NEXT: v_mov_b32_e32 v29, s21
; SDAG-NEXT: v_mov_b32_e32 v30, s22
; SDAG-NEXT: v_mov_b32_e32 v31, s23
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 2
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
; GISEL-NEXT: v_mov_b32_e32 v32, 0x41
; GISEL-NEXT: v_mov_b32_e32 v33, -2
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2)
store <16 x float> %result, ptr addrspace(1) %ptr, align 64
ret void
}
; The accumulator input %arg2 is stored (volatile) in addition to the MFMA
; result, so %arg2 is still needed after the MFMA has been issued. In the
; expected code the MFMA writes v[0:15] in place, the first group of stores is
; fed by values re-copied from the argument SGPRs, and the second group stores
; v[0:15]. All format/op_sel immediates are 0 and the scales come from the
; kernel arguments %scale0 and %scale1.
; NOTE(review): attribute group #1 is defined outside this excerpt.
define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) #1 {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v18, s12
; SDAG-NEXT: v_mov_b32_e32 v19, s13
; SDAG-NEXT: v_mov_b32_e32 v20, s14
; SDAG-NEXT: v_mov_b32_e32 v21, s15
; SDAG-NEXT: v_mov_b32_e32 v22, s16
; SDAG-NEXT: v_mov_b32_e32 v23, s17
; SDAG-NEXT: v_mov_b32_e32 v24, s18
; SDAG-NEXT: v_mov_b32_e32 v25, s19
; SDAG-NEXT: v_mov_b32_e32 v26, s20
; SDAG-NEXT: v_mov_b32_e32 v27, s21
; SDAG-NEXT: v_mov_b32_e32 v28, s22
; SDAG-NEXT: v_mov_b32_e32 v29, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
; SDAG-NEXT: v_mov_b32_e32 v30, s24
; SDAG-NEXT: v_mov_b32_e32 v31, s25
; SDAG-NEXT: v_mov_b32_e32 v32, s26
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b32_e32 v33, s27
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: v_mov_b32_e32 v16, s0
; SDAG-NEXT: v_mov_b32_e32 v17, s1
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[18:25], v[26:33], v[0:15], v16, v17 op_sel_hi:[0,0,0]
; SDAG-NEXT: v_mov_b32_e32 v18, s20
; SDAG-NEXT: v_mov_b32_e32 v19, s21
; SDAG-NEXT: v_mov_b32_e32 v20, s22
; SDAG-NEXT: v_mov_b32_e32 v21, s23
; SDAG-NEXT: v_mov_b64_e32 v[16:17], 48
; SDAG-NEXT: global_store_dwordx4 v[16:17], v[18:21], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v22, s18
; SDAG-NEXT: v_mov_b32_e32 v23, s19
; SDAG-NEXT: v_mov_b32_e32 v20, s16
; SDAG-NEXT: v_mov_b32_e32 v21, s17
; SDAG-NEXT: v_mov_b64_e32 v[18:19], 32
; SDAG-NEXT: global_store_dwordx4 v[18:19], v[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s14
; SDAG-NEXT: v_mov_b32_e32 v25, s15
; SDAG-NEXT: v_mov_b32_e32 v22, s12
; SDAG-NEXT: v_mov_b32_e32 v23, s13
; SDAG-NEXT: v_mov_b64_e32 v[20:21], 16
; SDAG-NEXT: global_store_dwordx4 v[20:21], v[22:25], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v26, s10
; SDAG-NEXT: v_mov_b32_e32 v27, s11
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b64_e32 v[22:23], 0
; SDAG-NEXT: global_store_dwordx4 v[22:23], v[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[18:19], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[16:17], v[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[22:23], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[20:21], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: v_mov_b32_e32 v36, s0
; GISEL-NEXT: v_mov_b32_e32 v37, s1
; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v36, v37 op_sel_hi:[0,0,0]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
store volatile <16 x float> %result, ptr addrspace(1) null, align 64
ret void
}
; Same shape as the _0_0__nonmac test (both %arg2 and the MFMA result are
; stored with volatile stores), but the scales are the constants 25 and 42 and
; the second format immediate is 2 (expected as blgp:2). The expected code for
; both selectors moves 25 and 42 into VGPRs and passes those to the MFMA.
; NOTE(review): attribute group #1 is defined outside this excerpt.
define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #1 {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
; SDAG-NEXT: v_mov_b32_e32 v16, 42
; SDAG-NEXT: v_mov_b32_e32 v17, 25
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v18, s12
; SDAG-NEXT: v_mov_b32_e32 v19, s13
; SDAG-NEXT: v_mov_b32_e32 v20, s14
; SDAG-NEXT: v_mov_b32_e32 v21, s15
; SDAG-NEXT: v_mov_b32_e32 v22, s16
; SDAG-NEXT: v_mov_b32_e32 v23, s17
; SDAG-NEXT: v_mov_b32_e32 v24, s18
; SDAG-NEXT: v_mov_b32_e32 v25, s19
; SDAG-NEXT: v_mov_b32_e32 v26, s20
; SDAG-NEXT: v_mov_b32_e32 v27, s21
; SDAG-NEXT: v_mov_b32_e32 v28, s22
; SDAG-NEXT: v_mov_b32_e32 v29, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; SDAG-NEXT: v_mov_b32_e32 v30, s24
; SDAG-NEXT: v_mov_b32_e32 v31, s25
; SDAG-NEXT: v_mov_b32_e32 v32, s26
; SDAG-NEXT: v_mov_b32_e32 v33, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[18:25], v[26:33], v[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:2
; SDAG-NEXT: v_mov_b32_e32 v18, s20
; SDAG-NEXT: v_mov_b32_e32 v19, s21
; SDAG-NEXT: v_mov_b32_e32 v20, s22
; SDAG-NEXT: v_mov_b32_e32 v21, s23
; SDAG-NEXT: v_mov_b64_e32 v[16:17], 48
; SDAG-NEXT: global_store_dwordx4 v[16:17], v[18:21], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v22, s18
; SDAG-NEXT: v_mov_b32_e32 v23, s19
; SDAG-NEXT: v_mov_b32_e32 v20, s16
; SDAG-NEXT: v_mov_b32_e32 v21, s17
; SDAG-NEXT: v_mov_b64_e32 v[18:19], 32
; SDAG-NEXT: global_store_dwordx4 v[18:19], v[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s14
; SDAG-NEXT: v_mov_b32_e32 v25, s15
; SDAG-NEXT: v_mov_b32_e32 v22, s12
; SDAG-NEXT: v_mov_b32_e32 v23, s13
; SDAG-NEXT: v_mov_b64_e32 v[20:21], 16
; SDAG-NEXT: global_store_dwordx4 v[20:21], v[22:25], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v26, s10
; SDAG-NEXT: v_mov_b32_e32 v27, s11
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b64_e32 v[22:23], 0
; SDAG-NEXT: global_store_dwordx4 v[22:23], v[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[18:19], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[16:17], v[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[22:23], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[20:21], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; GISEL-NEXT: v_mov_b32_e32 v36, 25
; GISEL-NEXT: v_mov_b32_e32 v37, 42
; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v36, v37 op_sel_hi:[0,0,0] blgp:2
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
store volatile <16 x float> %result, ptr addrspace(1) null, align 64
ret void
}
; Both scale operands are the constant 0. The expected code for both selectors
; uses the unscaled v_mfma_f32_32x32x64_f8f6f4 (with blgp:2) instead of the
; v_mfma_scale form, and writes the result to v[0:15] while reading the
; accumulator from v[16:31]. Both %arg2 and the result are then stored with
; volatile stores.
; NOTE(review): attribute group #0 is defined outside this excerpt.
define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v32, s12
; SDAG-NEXT: v_mov_b32_e32 v33, s13
; SDAG-NEXT: v_mov_b32_e32 v34, s14
; SDAG-NEXT: v_mov_b32_e32 v35, s15
; SDAG-NEXT: v_mov_b32_e32 v36, s16
; SDAG-NEXT: v_mov_b32_e32 v37, s17
; SDAG-NEXT: v_mov_b32_e32 v38, s18
; SDAG-NEXT: v_mov_b32_e32 v39, s19
; SDAG-NEXT: v_mov_b32_e32 v40, s20
; SDAG-NEXT: v_mov_b32_e32 v41, s21
; SDAG-NEXT: v_mov_b32_e32 v42, s22
; SDAG-NEXT: v_mov_b32_e32 v43, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; SDAG-NEXT: v_mov_b32_e32 v44, s24
; SDAG-NEXT: v_mov_b32_e32 v45, s25
; SDAG-NEXT: v_mov_b32_e32 v46, s26
; SDAG-NEXT: v_mov_b32_e32 v47, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
; SDAG-NEXT: s_nop 14
; SDAG-NEXT: v_mov_b32_e32 v16, s20
; SDAG-NEXT: v_mov_b32_e32 v17, s21
; SDAG-NEXT: v_mov_b32_e32 v18, s22
; SDAG-NEXT: v_mov_b32_e32 v19, s23
; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mov_b32_e32 v17, s17
; SDAG-NEXT: v_mov_b32_e32 v18, s18
; SDAG-NEXT: v_mov_b32_e32 v19, s19
; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
; SDAG-NEXT: v_mov_b32_e32 v16, s12
; SDAG-NEXT: v_mov_b32_e32 v17, s13
; SDAG-NEXT: v_mov_b32_e32 v18, s14
; SDAG-NEXT: v_mov_b32_e32 v19, s15
; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v16, s8
; SDAG-NEXT: v_mov_b32_e32 v17, s9
; SDAG-NEXT: v_mov_b32_e32 v18, s10
; SDAG-NEXT: v_mov_b32_e32 v19, s11
; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43]
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47]
; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49]
; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0)
store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
store volatile <16 x float> %result, ptr addrspace(1) null, align 64
ret void
}
; Same call as the _25_42__nonmac test (constant scales 25 and 42, expected
; blgp:2, volatile stores of both %arg2 and the result); the only difference in
; the IR is that this kernel uses attribute group #0 instead of #1. With the
; nonzero scales the expected code keeps the v_mfma_scale form and passes 25
; and 42 in VGPRs.
; NOTE(review): attribute groups #0 and #1 are defined outside this excerpt.
define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
; SDAG-NEXT: v_mov_b32_e32 v32, 42
; SDAG-NEXT: v_mov_b32_e32 v33, 25
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v16, s12
; SDAG-NEXT: v_mov_b32_e32 v17, s13
; SDAG-NEXT: v_mov_b32_e32 v18, s14
; SDAG-NEXT: v_mov_b32_e32 v19, s15
; SDAG-NEXT: v_mov_b32_e32 v20, s16
; SDAG-NEXT: v_mov_b32_e32 v21, s17
; SDAG-NEXT: v_mov_b32_e32 v22, s18
; SDAG-NEXT: v_mov_b32_e32 v23, s19
; SDAG-NEXT: v_mov_b32_e32 v24, s20
; SDAG-NEXT: v_mov_b32_e32 v25, s21
; SDAG-NEXT: v_mov_b32_e32 v26, s22
; SDAG-NEXT: v_mov_b32_e32 v27, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; SDAG-NEXT: v_mov_b32_e32 v28, s24
; SDAG-NEXT: v_mov_b32_e32 v29, s25
; SDAG-NEXT: v_mov_b32_e32 v30, s26
; SDAG-NEXT: v_mov_b32_e32 v31, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2
; SDAG-NEXT: v_mov_b32_e32 v16, s20
; SDAG-NEXT: v_mov_b32_e32 v17, s21
; SDAG-NEXT: v_mov_b32_e32 v18, s22
; SDAG-NEXT: v_mov_b32_e32 v19, s23
; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mov_b32_e32 v17, s17
; SDAG-NEXT: v_mov_b32_e32 v18, s18
; SDAG-NEXT: v_mov_b32_e32 v19, s19
; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
; SDAG-NEXT: v_mov_b32_e32 v16, s12
; SDAG-NEXT: v_mov_b32_e32 v17, s13
; SDAG-NEXT: v_mov_b32_e32 v18, s14
; SDAG-NEXT: v_mov_b32_e32 v19, s15
; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v16, s8
; SDAG-NEXT: v_mov_b32_e32 v17, s9
; SDAG-NEXT: v_mov_b32_e32 v18, s10
; SDAG-NEXT: v_mov_b32_e32 v19, s11
; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; GISEL-NEXT: v_mov_b32_e32 v32, 25
; GISEL-NEXT: v_mov_b32_e32 v33, 42
; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel_hi:[0,0,0] blgp:2
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 2
; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
store volatile <16 x float> %result, ptr addrspace(1) null, align 64
ret void
}
; Both scale operands are the constant 0 and every format/selector immediate
; is 0 (fp8 x fp8). The shared checks expect the unscaled
; v_mfma_f32_32x32x64_f8f6f4 form from both instruction selectors. The
; %scale0/%scale1 parameters are not used by the body.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31]
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v16
; GCN-NEXT: v_mov_b32_e32 v1, v17
; GCN-NEXT: v_mov_b32_e32 v2, v18
; GCN-NEXT: v_mov_b32_e32 v3, v19
; GCN-NEXT: v_mov_b32_e32 v4, v20
; GCN-NEXT: v_mov_b32_e32 v5, v21
; GCN-NEXT: v_mov_b32_e32 v6, v22
; GCN-NEXT: v_mov_b32_e32 v7, v23
; GCN-NEXT: v_mov_b32_e32 v8, v24
; GCN-NEXT: v_mov_b32_e32 v9, v25
; GCN-NEXT: v_mov_b32_e32 v10, v26
; GCN-NEXT: v_mov_b32_e32 v11, v27
; GCN-NEXT: v_mov_b32_e32 v12, v28
; GCN-NEXT: v_mov_b32_e32 v13, v29
; GCN-NEXT: v_mov_b32_e32 v14, v30
; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; Like the constant_scale_0_0_a test, but the immediates that precede each
; scale operand are 3 and 1 instead of 0 (presumably the scale op_sel
; selectors; confirm against the intrinsic definition). Both scales are still
; the constant 0, and the shared checks again expect the unscaled
; v_mfma_f32_32x32x64_f8f6f4 with no modifiers.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31]
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v16
; GCN-NEXT: v_mov_b32_e32 v1, v17
; GCN-NEXT: v_mov_b32_e32 v2, v18
; GCN-NEXT: v_mov_b32_e32 v3, v19
; GCN-NEXT: v_mov_b32_e32 v4, v20
; GCN-NEXT: v_mov_b32_e32 v5, v21
; GCN-NEXT: v_mov_b32_e32 v6, v22
; GCN-NEXT: v_mov_b32_e32 v7, v23
; GCN-NEXT: v_mov_b32_e32 v8, v24
; GCN-NEXT: v_mov_b32_e32 v9, v25
; GCN-NEXT: v_mov_b32_e32 v10, v26
; GCN-NEXT: v_mov_b32_e32 v11, v27
; GCN-NEXT: v_mov_b32_e32 v12, v28
; GCN-NEXT: v_mov_b32_e32 v13, v29
; GCN-NEXT: v_mov_b32_e32 v14, v30
; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0)
ret <16 x float> %result
}
; Constant scales: 0 for the first scale operand and 1 for the second. The
; checks expect both constants to be materialized into VGPRs with v_mov_b32
; and passed to the scaled v_mfma_scale_f32_32x32x64_f8f6f4. SelectionDAG and
; GlobalISel assign the pair to v33/v32 and v32/v33 respectively, so the
; operand order (0 first, 1 second) is the same in both.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: v_mov_b32_e32 v32, 1
; SDAG-NEXT: v_mov_b32_e32 v33, 0
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: v_mov_b32_e32 v32, 0
; GISEL-NEXT: v_mov_b32_e32 v33, 1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
ret <16 x float> %result
}
; Constant scales: 1 for the first scale operand and 0 for the second (the
; mirror of the constant_scale_0_1 test). The checks expect both constants to
; be moved into VGPRs and the scaled v_mfma_scale_f32_32x32x64_f8f6f4 to be
; selected, with 1 as the first scale register and 0 as the second in both
; the SelectionDAG and GlobalISel output.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: v_mov_b32_e32 v32, 0
; SDAG-NEXT: v_mov_b32_e32 v33, 1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: v_mov_b32_e32 v32, 1
; GISEL-NEXT: v_mov_b32_e32 v33, 0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0)
ret <16 x float> %result
}
; --------------------------------------------------------------------
; Incorrect signature for format cases (IR vector too large)
; --------------------------------------------------------------------
; fp8 (cbsz = 0) x fp6 (blgp = 2) with both sources passed as <8 x i32> and
; the scales taken from the %scale0/%scale1 arguments. The checks expect the
; scales to be loaded from the stack at offsets 4 and 8 and the instruction
; to carry only the blgp:2 format modifier.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] blgp:2
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] blgp:2
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 2, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; fp6 (cbsz = 2) x fp8 (blgp = 0) with both sources passed as <8 x i32> and
; the scales taken from the %scale0/%scale1 arguments. The checks expect the
; scaled MFMA to carry only the cbsz:2 format modifier.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] cbsz:2
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] cbsz:2
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
i32 0, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; fp6 (cbsz = 2) x fp6 (blgp = 2) with both sources passed as <8 x i32> and
; the scales taken from the %scale0/%scale1 arguments. The checks expect both
; cbsz:2 and blgp:2 on the scaled MFMA, followed by a single s_nop 11 before
; the result is copied out (the fp8-involving cases in this file expect
; s_nop 15 + s_nop 3 instead).
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] cbsz:2 blgp:2
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] cbsz:2 blgp:2
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
i32 2, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; fp6 (cbsz = 2) x fp6 (blgp = 2) on <8 x i32> sources with both scale
; operands the constant 0; the function takes no scale parameters. The shared
; checks expect the unscaled v_mfma_f32_32x32x64_f8f6f4 with cbsz:2 blgp:2
; from both instruction selectors.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31] cbsz:2 blgp:2
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v16
; GCN-NEXT: v_mov_b32_e32 v1, v17
; GCN-NEXT: v_mov_b32_e32 v2, v18
; GCN-NEXT: v_mov_b32_e32 v3, v19
; GCN-NEXT: v_mov_b32_e32 v4, v20
; GCN-NEXT: v_mov_b32_e32 v5, v21
; GCN-NEXT: v_mov_b32_e32 v6, v22
; GCN-NEXT: v_mov_b32_e32 v7, v23
; GCN-NEXT: v_mov_b32_e32 v8, v24
; GCN-NEXT: v_mov_b32_e32 v9, v25
; GCN-NEXT: v_mov_b32_e32 v10, v26
; GCN-NEXT: v_mov_b32_e32 v11, v27
; GCN-NEXT: v_mov_b32_e32 v12, v28
; GCN-NEXT: v_mov_b32_e32 v13, v29
; GCN-NEXT: v_mov_b32_e32 v14, v30
; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
i32 2, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; fp8 (cbsz = 0) x fp4 (blgp = 4) with both sources passed as <8 x i32> and
; the scales taken from the %scale0/%scale1 arguments. The checks expect the
; scaled MFMA to carry only the blgp:4 format modifier.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] blgp:4
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] blgp:4
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 4, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; fp4 (cbsz = 4) x fp8 (blgp = 0) with both sources passed as <8 x i32> and
; the scales taken from the %scale0/%scale1 arguments. The checks expect the
; scaled MFMA to carry only the cbsz:4 format modifier.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] cbsz:4
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] cbsz:4
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
i32 0, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; fp8 (cbsz = 0) in <8 x i32> x fp4 (blgp = 4) in <6 x i32>, using the
; .v8i32.v6i32 overload of the intrinsic. The shared checks expect the second
; source to occupy the 6-register range v[8:13], the first scale to arrive in
; v30, the second scale to be loaded from the stack into v31, and blgp:4 on
; the scaled MFMA.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] blgp:4
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v14
; GCN-NEXT: v_mov_b32_e32 v1, v15
; GCN-NEXT: v_mov_b32_e32 v2, v16
; GCN-NEXT: v_mov_b32_e32 v3, v17
; GCN-NEXT: v_mov_b32_e32 v4, v18
; GCN-NEXT: v_mov_b32_e32 v5, v19
; GCN-NEXT: v_mov_b32_e32 v6, v20
; GCN-NEXT: v_mov_b32_e32 v7, v21
; GCN-NEXT: v_mov_b32_e32 v8, v22
; GCN-NEXT: v_mov_b32_e32 v9, v23
; GCN-NEXT: v_mov_b32_e32 v10, v24
; GCN-NEXT: v_mov_b32_e32 v11, v25
; GCN-NEXT: v_mov_b32_e32 v12, v26
; GCN-NEXT: v_mov_b32_e32 v13, v27
; GCN-NEXT: v_mov_b32_e32 v14, v28
; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 4, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; fp4 (cbsz = 4) in <6 x i32> x fp8 (blgp = 0) in <8 x i32>, using the
; .v6i32.v8i32 overload of the intrinsic. The shared checks expect the first
; source to occupy the 6-register range v[0:5], the first scale to arrive in
; v30, the second scale to be loaded from the stack into v31, and cbsz:4 on
; the scaled MFMA.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:4
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_mov_b32_e32 v0, v14
; GCN-NEXT: v_mov_b32_e32 v1, v15
; GCN-NEXT: v_mov_b32_e32 v2, v16
; GCN-NEXT: v_mov_b32_e32 v3, v17
; GCN-NEXT: v_mov_b32_e32 v4, v18
; GCN-NEXT: v_mov_b32_e32 v5, v19
; GCN-NEXT: v_mov_b32_e32 v6, v20
; GCN-NEXT: v_mov_b32_e32 v7, v21
; GCN-NEXT: v_mov_b32_e32 v8, v22
; GCN-NEXT: v_mov_b32_e32 v9, v23
; GCN-NEXT: v_mov_b32_e32 v10, v24
; GCN-NEXT: v_mov_b32_e32 v11, v25
; GCN-NEXT: v_mov_b32_e32 v12, v26
; GCN-NEXT: v_mov_b32_e32 v13, v27
; GCN-NEXT: v_mov_b32_e32 v14, v28
; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
i32 0, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; fp4 (cbsz = 4) x fp4 (blgp = 4) with both sources passed as <8 x i32> and
; the scales taken from the %scale0/%scale1 arguments. The checks expect both
; cbsz:4 and blgp:4 on the scaled MFMA, followed by a single s_nop 11 before
; the result is copied out.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword v31, off, s32
; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] cbsz:4 blgp:4
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_mov_b32_e32 v0, v16
; SDAG-NEXT: v_mov_b32_e32 v1, v17
; SDAG-NEXT: v_mov_b32_e32 v2, v18
; SDAG-NEXT: v_mov_b32_e32 v3, v19
; SDAG-NEXT: v_mov_b32_e32 v4, v20
; SDAG-NEXT: v_mov_b32_e32 v5, v21
; SDAG-NEXT: v_mov_b32_e32 v6, v22
; SDAG-NEXT: v_mov_b32_e32 v7, v23
; SDAG-NEXT: v_mov_b32_e32 v8, v24
; SDAG-NEXT: v_mov_b32_e32 v9, v25
; SDAG-NEXT: v_mov_b32_e32 v10, v26
; SDAG-NEXT: v_mov_b32_e32 v11, v27
; SDAG-NEXT: v_mov_b32_e32 v12, v28
; SDAG-NEXT: v_mov_b32_e32 v13, v29
; SDAG-NEXT: v_mov_b32_e32 v14, v30
; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword v31, off, s32
; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] cbsz:4 blgp:4
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_mov_b32_e32 v0, v16
; GISEL-NEXT: v_mov_b32_e32 v1, v17
; GISEL-NEXT: v_mov_b32_e32 v2, v18
; GISEL-NEXT: v_mov_b32_e32 v3, v19
; GISEL-NEXT: v_mov_b32_e32 v4, v20
; GISEL-NEXT: v_mov_b32_e32 v5, v21
; GISEL-NEXT: v_mov_b32_e32 v6, v22
; GISEL-NEXT: v_mov_b32_e32 v7, v23
; GISEL-NEXT: v_mov_b32_e32 v8, v24
; GISEL-NEXT: v_mov_b32_e32 v9, v25
; GISEL-NEXT: v_mov_b32_e32 v10, v26
; GISEL-NEXT: v_mov_b32_e32 v11, v27
; GISEL-NEXT: v_mov_b32_e32 v12, v28
; GISEL-NEXT: v_mov_b32_e32 v13, v29
; GISEL-NEXT: v_mov_b32_e32 v14, v30
; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
i32 4, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
; fp4 (cbsz = 4) x fp4 (blgp = 4) on <8 x i32> sources with both scale
; operands the constant 0; the function takes no scale parameters. The shared
; checks expect the unscaled v_mfma_f32_32x32x64_f8f6f4 with cbsz:4 blgp:4
; from both instruction selectors.
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31] cbsz:4 blgp:4
; GCN-NEXT: s_nop 11
; GCN-NEXT: v_mov_b32_e32 v0, v16
; GCN-NEXT: v_mov_b32_e32 v1, v17
; GCN-NEXT: v_mov_b32_e32 v2, v18
; GCN-NEXT: v_mov_b32_e32 v3, v19
; GCN-NEXT: v_mov_b32_e32 v4, v20
; GCN-NEXT: v_mov_b32_e32 v5, v21
; GCN-NEXT: v_mov_b32_e32 v6, v22
; GCN-NEXT: v_mov_b32_e32 v7, v23
; GCN-NEXT: v_mov_b32_e32 v8, v24
; GCN-NEXT: v_mov_b32_e32 v9, v25
; GCN-NEXT: v_mov_b32_e32 v10, v26
; GCN-NEXT: v_mov_b32_e32 v11, v27
; GCN-NEXT: v_mov_b32_e32 v12, v28
; GCN-NEXT: v_mov_b32_e32 v13, v29
; GCN-NEXT: v_mov_b32_e32 v14, v30
; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
i32 4, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
; Declarations of the scaled MFMA intrinsic, one per mangled pair of source
; vector types (<4 x i32>, <6 x i32> or <8 x i32> for each of the first two
; operands). Operands 4, 5, 6 and 8 are immarg and must be constants at the
; call site; operands 7 and 9 (the two scale values in the calls in this
; file) are ordinary i32 values.
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
attributes #1 = { "amdgpu-flat-work-group-size"="128,128" }
attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }