; blob: b2465b02f2eeeaf000cf4d01f79681f57bfe1fa0 [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mcpu=gfx950 -amdgpu-mfma-vgpr-form < %s | FileCheck %s
; The RUN line passes only -mcpu, so the target triple is taken from here.
target triple = "amdgcn-amd-amdhsa"
; MFMA f32 16x16x32 f16. src2 is loaded from %ptr and the result is consumed by
; inline asm through an "a" constraint; the expected output keeps the load
; destination, the MFMA dst/src2 and the asm use all in a[0:3].
define void @test_rewrite_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg1, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_mfma_f32_16x16x32_f16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <4 x float>, ptr addrspace(1) %ptr
  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(
      <8 x half> %arg0, <8 x half> %arg1, <4 x float> %acc,
      i32 0, i32 0, i32 0)
  call void asm sideeffect "; use $0", "a"(<4 x float> %result)
  ret void
}
; MFMA f32 32x32x16 f16 with a 16-element src2 loaded from %ptr. The expected
; output fills a[0:15] with four dwordx4 loads and uses a[0:15] as both the
; MFMA destination and src2, then hands it to the "a"-constrained inline asm.
define void @test_rewrite_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %arg1, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x16_f16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:15]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <16 x float>, ptr addrspace(1) %ptr
  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(
      <8 x half> %arg0, <8 x half> %arg1, <16 x float> %acc,
      i32 0, i32 0, i32 0)
  call void asm sideeffect "; use $0", "a"(<16 x float> %result)
  ret void
}
; MFMA i32 16x16x64 i8. The <4 x i32> src2 is loaded from %ptr and the result
; feeds an "a"-constrained inline asm; the expected output uses a[0:3] for the
; load destination, the MFMA dst/src2 and the asm operand.
define void @test_rewrite_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_mfma_i32_16x16x64_i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <4 x i32>, ptr addrspace(1) %ptr
  %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(
      <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %acc,
      i32 0, i32 0, i32 0)
  call void asm sideeffect "; use $0", "a"(<4 x i32> %result)
  ret void
}
; MFMA i32 32x32x32 i8 with a <16 x i32> src2 loaded from %ptr. The expected
; output loads straight into a[0:15] (four dwordx4 loads) and uses a[0:15] as
; MFMA dst/src2 and as the inline-asm operand.
define void @test_rewrite_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> %arg1, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_mfma_i32_32x32x32_i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:15]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <16 x i32>, ptr addrspace(1) %ptr
  %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(
      <4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %acc,
      i32 0, i32 0, i32 0)
  call void asm sideeffect "; use $0", "a"(<16 x i32> %result)
  ret void
}
; MFMA f32 16x16x32 bf16. src2 comes from a load of %ptr and the result goes to
; an "a"-constrained inline asm; the expected output keeps everything in
; a[0:3].
define void @test_rewrite_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_mfma_f32_16x16x32_bf16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <4 x float>, ptr addrspace(1) %ptr
  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(
      <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %acc,
      i32 0, i32 0, i32 0)
  call void asm sideeffect "; use $0", "a"(<4 x float> %result)
  ret void
}
; MFMA f32 32x32x16 bf16 with a <16 x float> src2 loaded from %ptr. The
; expected output loads into a[0:15] with four dwordx4 loads and uses a[0:15]
; for the MFMA dst/src2 and the inline-asm operand.
define void @test_rewrite_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x16_bf16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:15]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <16 x float>, ptr addrspace(1) %ptr
  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(
      <8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %acc,
      i32 0, i32 0, i32 0)
  call void asm sideeffect "; use $0", "a"(<16 x float> %result)
  ret void
}
; TODO: The full cross product of src0/src1 operand sizes is not tested; the
; f8f6f4 tests below only use matching cbsz/blgp pairs (0/0, 2/2 and 4/4).
; Scaled MFMA f32 16x16x128 f8f6f4 with <8 x i32> sources, cbsz = 0, blgp = 0
; and the two scale values taken from %scale0 / %scale1 (v16 / v17 in the
; expected output). src2 is loaded from %ptr; the expected output keeps the
; load, the MFMA dst/src2 and the inline-asm use in a[0:3].
define void @test_rewrite_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[18:19], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <4 x float>, ptr addrspace(1) %ptr
  %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(
      <8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %acc,
      i32 0, i32 0, ; cbsz, blgp
      i32 0, i32 %scale0, i32 0, i32 %scale1)
  call void asm sideeffect "; use $0", "a"(<4 x float> %result)
  ret void
}
; mfma.scale f32 16x16x128 f8f6f4 intrinsic with <8 x i32> sources, cbsz = 0,
; blgp = 0 and all four trailing i32 operands constant 0. The expected
; instruction is the plain v_mfma_f32_16x16x128_f8f6f4 (no scale operands),
; with the load, MFMA dst/src2 and inline-asm use all in a[0:3].
define void @test_rewrite_mfma_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_mfma_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[16:17], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <4 x float>, ptr addrspace(1) %ptr
  %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(
      <8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %acc,
      i32 0, i32 0, ; cbsz, blgp
      i32 0, i32 0, i32 0, i32 0)
  call void asm sideeffect "; use $0", "a"(<4 x float> %result)
  ret void
}
; Scaled MFMA f32 16x16x128 f8f6f4 with <6 x i32> sources, cbsz = 2, blgp = 2
; and scale values from %scale0 / %scale1 (v12 / v13 in the expected output).
; src2 is loaded from %ptr; load, MFMA dst/src2 and asm use stay in a[0:3].
define void @test_rewrite_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:2 blgp:2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <4 x float>, ptr addrspace(1) %ptr
  %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(
      <6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %acc,
      i32 2, i32 2, ; cbsz, blgp
      i32 0, i32 %scale0, i32 0, i32 %scale1)
  call void asm sideeffect "; use $0", "a"(<4 x float> %result)
  ret void
}
; mfma.scale f32 16x16x128 f8f6f4 intrinsic with <6 x i32> sources, cbsz = 2,
; blgp = 2 and all four trailing i32 operands constant 0. The expected
; instruction is the plain v_mfma_f32_16x16x128_f8f6f4 with cbsz:2 blgp:2,
; operating on a[0:3] loaded directly from %ptr.
define void @test_rewrite_mfma_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_mfma_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[12:13], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <4 x float>, ptr addrspace(1) %ptr
  %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(
      <6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %acc,
      i32 2, i32 2, ; cbsz, blgp
      i32 0, i32 0, i32 0, i32 0)
  call void asm sideeffect "; use $0", "a"(<4 x float> %result)
  ret void
}
; Scaled MFMA f32 16x16x128 f8f6f4 with <4 x i32> sources, cbsz = 4, blgp = 4
; and scale values from %scale0 / %scale1 (v8 / v9 in the expected output).
; src2 is loaded from %ptr; load, MFMA dst/src2 and asm use stay in a[0:3].
define void @test_rewrite_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[10:11], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v8, v9 op_sel_hi:[0,0,0] cbsz:4 blgp:4
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <4 x float>, ptr addrspace(1) %ptr
  %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(
      <4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %acc,
      i32 4, i32 4, ; cbsz, blgp
      i32 0, i32 %scale0, i32 0, i32 %scale1)
  call void asm sideeffect "; use $0", "a"(<4 x float> %result)
  ret void
}
; mfma.scale f32 16x16x128 f8f6f4 intrinsic with <4 x i32> sources, cbsz = 4,
; blgp = 4 and all four trailing i32 operands constant 0. The expected
; instruction is the plain v_mfma_f32_16x16x128_f8f6f4 with cbsz:4 blgp:4,
; operating on a[0:3] loaded directly from %ptr.
define void @test_rewrite_mfma_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_mfma_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <4 x float>, ptr addrspace(1) %ptr
  %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(
      <4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %acc,
      i32 4, i32 4, ; cbsz, blgp
      i32 0, i32 0, i32 0, i32 0)
  call void asm sideeffect "; use $0", "a"(<4 x float> %result)
  ret void
}
; Scaled MFMA f32 32x32x64 f8f6f4 with <8 x i32> sources, cbsz = 0, blgp = 0
; and scale values from %scale0 / %scale1 (v16 / v17 in the expected output).
; The <16 x float> src2 is loaded from %ptr; the expected output loads it
; into a[0:15] and uses a[0:15] for the MFMA dst/src2 and the asm operand.
define void @test_rewrite_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[18:19], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[18:19], off offset:32
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[18:19], off offset:16
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[18:19], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:15]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <16 x float>, ptr addrspace(1) %ptr
  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(
      <8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %acc,
      i32 0, i32 0, ; cbsz, blgp
      i32 0, i32 %scale0, i32 0, i32 %scale1)
  call void asm sideeffect "; use $0", "a"(<16 x float> %result)
  ret void
}
; mfma.scale f32 32x32x64 f8f6f4 intrinsic with <8 x i32> sources, cbsz = 0,
; blgp = 0 and all four trailing i32 operands constant 0. The expected
; instruction is the plain v_mfma_f32_32x32x64_f8f6f4 (no scale operands),
; operating on a[0:15] loaded directly from %ptr.
define void @test_rewrite_mfma_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[16:17], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[16:17], off offset:32
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[16:17], off offset:16
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[16:17], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:15]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <16 x float>, ptr addrspace(1) %ptr
  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(
      <8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %acc,
      i32 0, i32 0, ; cbsz, blgp
      i32 0, i32 0, i32 0, i32 0)
  call void asm sideeffect "; use $0", "a"(<16 x float> %result)
  ret void
}
; Scaled MFMA f32 32x32x64 f8f6f4 with <6 x i32> sources, cbsz = 2, blgp = 2
; and scale values from %scale0 / %scale1 (v12 / v13 in the expected output).
; The <16 x float> src2 is loaded from %ptr straight into a[0:15], which is
; also the MFMA dst/src2 and the inline-asm operand.
define void @test_rewrite_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v12, v13 op_sel_hi:[0,0,0] cbsz:2 blgp:2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:15]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <16 x float>, ptr addrspace(1) %ptr
  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(
      <6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %acc,
      i32 2, i32 2, ; cbsz, blgp
      i32 0, i32 %scale0, i32 0, i32 %scale1)
  call void asm sideeffect "; use $0", "a"(<16 x float> %result)
  ret void
}
; mfma.scale f32 32x32x64 f8f6f4 intrinsic with <6 x i32> sources, cbsz = 2,
; blgp = 2 and all four trailing i32 operands constant 0. The expected
; instruction is the plain v_mfma_f32_32x32x64_f8f6f4 with cbsz:2 blgp:2,
; operating on a[0:15] loaded directly from %ptr.
define void @test_rewrite_mfma_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[12:13], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[12:13], off offset:32
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[12:13], off offset:16
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[12:13], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:15]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <16 x float>, ptr addrspace(1) %ptr
  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(
      <6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %acc,
      i32 2, i32 2, ; cbsz, blgp
      i32 0, i32 0, i32 0, i32 0)
  call void asm sideeffect "; use $0", "a"(<16 x float> %result)
  ret void
}
; Scaled MFMA f32 32x32x64 f8f6f4 with <4 x i32> sources, cbsz = 4, blgp = 4
; and scale values from %scale0 / %scale1 (v8 / v9 in the expected output).
; The <16 x float> src2 is loaded from %ptr straight into a[0:15], which is
; also the MFMA dst/src2 and the inline-asm operand.
define void @test_rewrite_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[10:11], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[10:11], off offset:32
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[10:11], off offset:16
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[10:11], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v8, v9 op_sel_hi:[0,0,0] cbsz:4 blgp:4
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:15]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <16 x float>, ptr addrspace(1) %ptr
  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(
      <4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %acc,
      i32 4, i32 4, ; cbsz, blgp
      i32 0, i32 %scale0, i32 0, i32 %scale1)
  call void asm sideeffect "; use $0", "a"(<16 x float> %result)
  ret void
}
; mfma.scale f32 32x32x64 f8f6f4 intrinsic with <4 x i32> sources, cbsz = 4,
; blgp = 4 and all four trailing i32 operands constant 0. The expected
; instruction is the plain v_mfma_f32_32x32x64_f8f6f4 with cbsz:4 blgp:4,
; operating on a[0:15] loaded directly from %ptr.
define void @test_rewrite_mfma_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:15]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <16 x float>, ptr addrspace(1) %ptr
  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(
      <4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %acc,
      i32 4, i32 4, ; cbsz, blgp
      i32 0, i32 0, i32 0, i32 0)
  call void asm sideeffect "; use $0", "a"(<16 x float> %result)
  ret void
}
; SMFMAC f32 16x16x64 f16. src2 is loaded from %ptr and %arg2 is the fourth
; intrinsic operand (v12 in the expected instruction). The expected output
; copies v14->v15 and v13->v14, loads through v[14:15] into a[0:3], and uses
; a[0:3] as the SMFMAC destination and the inline-asm operand.
define void @test_rewrite_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, i32 %arg2, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_smfmac_f32_16x16x64_f16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v12
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <4 x float>, ptr addrspace(1) %ptr
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(
      <8 x half> %arg0, <16 x half> %arg1, <4 x float> %acc,
      i32 %arg2, i32 0, i32 0)
  call void asm sideeffect "; use $0", "a"(<4 x float> %result)
  ret void
}
; SMFMAC f32 32x32x32 f16 with a <16 x float> src2 loaded from %ptr; %arg2 is
; the fourth intrinsic operand (v12 in the expected instruction). The expected
; output copies v14->v15 and v13->v14, fills a[0:15] with four dwordx4 loads
; through v[14:15], and uses a[0:15] for the SMFMAC and the inline asm.
define void @test_rewrite_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> %arg1, i32 %arg2, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_smfmac_f32_32x32x32_f16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v12
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:15]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <16 x float>, ptr addrspace(1) %ptr
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(
      <8 x half> %arg0, <16 x half> %arg1, <16 x float> %acc,
      i32 %arg2, i32 0, i32 0)
  call void asm sideeffect "; use $0", "a"(<16 x float> %result)
  ret void
}
; SMFMAC f32 16x16x64 bf16. src2 is loaded from %ptr and %arg2 is the fourth
; intrinsic operand (v12 in the expected instruction). The expected output
; copies v14->v15 and v13->v14, loads through v[14:15] into a[0:3], and uses
; a[0:3] as the SMFMAC destination and the inline-asm operand.
define void @test_rewrite_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, i32 %arg2, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_smfmac_f32_16x16x64_bf16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v12
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <4 x float>, ptr addrspace(1) %ptr
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(
      <8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %acc,
      i32 %arg2, i32 0, i32 0)
  call void asm sideeffect "; use $0", "a"(<4 x float> %result)
  ret void
}
; SMFMAC f32 32x32x32 bf16 with a <16 x float> src2 loaded from %ptr; %arg2 is
; the fourth intrinsic operand (v12 in the expected instruction). The expected
; output copies v14->v15 and v13->v14, fills a[0:15] with four dwordx4 loads
; through v[14:15], and uses a[0:15] for the SMFMAC and the inline asm.
define void @test_rewrite_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, i32 %arg2, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_smfmac_f32_32x32x32_bf16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v12
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:15]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <16 x float>, ptr addrspace(1) %ptr
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(
      <8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %acc,
      i32 %arg2, i32 0, i32 0)
  call void asm sideeffect "; use $0", "a"(<16 x float> %result)
  ret void
}
; SMFMAC i32 16x16x128 i8. The <4 x i32> src2 is loaded from %ptr and %arg2 is
; the fourth intrinsic operand (v12 in the expected instruction). The expected
; output copies v14->v15 and v13->v14, loads through v[14:15] into a[0:3], and
; uses a[0:3] as the SMFMAC destination and the inline-asm operand.
define void @test_rewrite_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, i32 %arg2, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_smfmac_i32_16x16x128_i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v12
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <4 x i32>, ptr addrspace(1) %ptr
  %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(
      <4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %acc,
      i32 %arg2, i32 0, i32 0)
  call void asm sideeffect "; use $0", "a"(<4 x i32> %result)
  ret void
}
; SMFMAC i32 32x32x64 i8 with a <16 x i32> src2 loaded from %ptr; %arg2 is the
; fourth intrinsic operand (v12 in the expected instruction). The expected
; output copies v14->v15 and v13->v14, fills a[0:15] with four dwordx4 loads
; through v[14:15], and uses a[0:15] for the SMFMAC and the inline asm.
define void @test_rewrite_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, i32 %arg2, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_smfmac_i32_32x32x64_i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v12
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:15]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <16 x i32>, ptr addrspace(1) %ptr
  %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(
      <4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %acc,
      i32 %arg2, i32 0, i32 0)
  call void asm sideeffect "; use $0", "a"(<16 x i32> %result)
  ret void
}
; SMFMAC f32 16x16x128 bf8_bf8. src2 is loaded from %ptr, %arg2 is the fourth
; operand (v12), and the trailing immediates 1 and 2 show up as cbsz:1 abid:2
; in the expected instruction. The expected output loads through v[14:15]
; into a[0:3] and uses a[0:3] for the SMFMAC and the inline asm.
define void @test_rewrite_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, i32 %arg2, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_smfmac_f32_16x16x128_bf8_bf8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v12 cbsz:1 abid:2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <4 x float>, ptr addrspace(1) %ptr
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(
      <4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %acc,
      i32 %arg2, i32 1, i32 2)
  call void asm sideeffect "; use $0", "a"(<4 x float> %result)
  ret void
}
; SMFMAC f32 16x16x128 bf8_fp8. src2 is loaded from %ptr, %arg2 is the fourth
; operand (v12), and the trailing immediates 1 and 2 show up as cbsz:1 abid:2
; in the expected instruction. The expected output loads through v[14:15]
; into a[0:3] and uses a[0:3] for the SMFMAC and the inline asm.
define void @test_rewrite_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, i32 %arg2, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_smfmac_f32_16x16x128_bf8_fp8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v12 cbsz:1 abid:2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <4 x float>, ptr addrspace(1) %ptr
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(
      <4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %acc,
      i32 %arg2, i32 1, i32 2)
  call void asm sideeffect "; use $0", "a"(<4 x float> %result)
  ret void
}
; SMFMAC f32 16x16x128 fp8_bf8. src2 is loaded from %ptr, %arg2 is the fourth
; operand (v12), and the trailing immediates 1 and 2 show up as cbsz:1 abid:2
; in the expected instruction. The expected output loads through v[14:15]
; into a[0:3] and uses a[0:3] for the SMFMAC and the inline asm.
define void @test_rewrite_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, i32 %arg2, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_smfmac_f32_16x16x128_fp8_bf8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v12 cbsz:1 abid:2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <4 x float>, ptr addrspace(1) %ptr
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(
      <4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %acc,
      i32 %arg2, i32 1, i32 2)
  call void asm sideeffect "; use $0", "a"(<4 x float> %result)
  ret void
}
; SMFMAC f32 16x16x128 fp8_fp8. src2 is loaded from %ptr, %arg2 is the fourth
; operand (v12), and the trailing immediates 1 and 2 show up as cbsz:1 abid:2
; in the expected instruction. The expected output loads through v[14:15]
; into a[0:3] and uses a[0:3] for the SMFMAC and the inline asm.
define void @test_rewrite_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, i32 %arg2, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_smfmac_f32_16x16x128_fp8_fp8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v12 cbsz:1 abid:2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <4 x float>, ptr addrspace(1) %ptr
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(
      <4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %acc,
      i32 %arg2, i32 1, i32 2)
  call void asm sideeffect "; use $0", "a"(<4 x float> %result)
  ret void
}
; SMFMAC f32 32x32x64 bf8_bf8 with a <16 x float> src2 loaded from %ptr. %arg2
; is the fourth operand (v12) and the trailing immediates 1 and 2 show up as
; cbsz:1 abid:2. The expected output fills a[0:15] with four dwordx4 loads
; through v[14:15] and uses a[0:15] for the SMFMAC and the inline asm.
define void @test_rewrite_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, i32 %arg2, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_smfmac_f32_32x32x64_bf8_bf8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v12 cbsz:1 abid:2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:15]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <16 x float>, ptr addrspace(1) %ptr
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(
      <4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %acc,
      i32 %arg2, i32 1, i32 2)
  call void asm sideeffect "; use $0", "a"(<16 x float> %result)
  ret void
}
; SMFMAC f32 32x32x64 bf8_fp8 with a <16 x float> src2 loaded from %ptr. %arg2
; is the fourth operand (v12) and the trailing immediates 1 and 2 show up as
; cbsz:1 abid:2. The expected output fills a[0:15] with four dwordx4 loads
; through v[14:15] and uses a[0:15] for the SMFMAC and the inline asm.
define void @test_rewrite_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, i32 %arg2, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_smfmac_f32_32x32x64_bf8_fp8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v12 cbsz:1 abid:2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:15]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <16 x float>, ptr addrspace(1) %ptr
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(
      <4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %acc,
      i32 %arg2, i32 1, i32 2)
  call void asm sideeffect "; use $0", "a"(<16 x float> %result)
  ret void
}
; SMFMAC f32 32x32x64 fp8_bf8 with a <16 x float> src2 loaded from %ptr. %arg2
; is the fourth operand (v12) and the trailing immediates 1 and 2 show up as
; cbsz:1 abid:2. The expected output fills a[0:15] with four dwordx4 loads
; through v[14:15] and uses a[0:15] for the SMFMAC and the inline asm.
define void @test_rewrite_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, i32 %arg2, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_smfmac_f32_32x32x64_fp8_bf8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v12 cbsz:1 abid:2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:15]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <16 x float>, ptr addrspace(1) %ptr
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(
      <4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %acc,
      i32 %arg2, i32 1, i32 2)
  call void asm sideeffect "; use $0", "a"(<16 x float> %result)
  ret void
}
; SMFMAC f32 32x32x64 fp8_fp8 with a <16 x float> src2 loaded from %ptr. %arg2
; is the fourth operand (v12) and the trailing immediates 1 and 2 show up as
; cbsz:1 abid:2. The expected output fills a[0:15] with four dwordx4 loads
; through v[14:15] and uses a[0:15] for the SMFMAC and the inline asm.
define void @test_rewrite_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, i32 %arg2, ptr addrspace(1) %ptr) #0 {
; CHECK-LABEL: test_rewrite_smfmac_f32_32x32x64_fp8_fp8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v12 cbsz:1 abid:2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:15]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %acc = load <16 x float>, ptr addrspace(1) %ptr
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(
      <4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %acc,
      i32 %arg2, i32 1, i32 2)
  call void asm sideeffect "; use $0", "a"(<16 x float> %result)
  ret void
}
; Attribute group referenced as #0 by the test functions above.
attributes #0 = { nounwind }