| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s |
| ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s |
| ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s |
| ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s |
| ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s |
| |
| declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) |
| |
| ; Kernel: loads a <32 x float> from %arg, passes it as the accumulator operand |
| ; of llvm.amdgcn.mfma.f32.32x32x1f32 (constant operands 1.0 and 2.0, all three |
| ; trailing i32 arguments 0), and stores the result back to %arg. |
| ; The GFX908 checks below expect the input to be fetched with two |
| ; s_load_dwordx16, copied through VGPRs into a[0:31] with v_accvgpr_write_b32 |
| ; before the MFMA, and read back with v_accvgpr_read_b32 for the stores. |
| ; NOTE(review): the IR body is the same as in the _agpr variant; the only |
| ; difference visible here is the attribute group (#0 here, #2 there) - confirm |
| ; against the attribute definitions what each group is meant to select. |
| define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg) #0 { |
| ; GFX908-LABEL: test_mfma_f32_32x32x1f32_vgpr: |
| ; GFX908: ; %bb.0: ; %bb |
| ; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX908-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX908-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX908-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 |
| ; GFX908-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 |
| ; GFX908-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s16 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s17 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s18 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s21 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s22 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s23 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a5, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a6, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a7, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s24 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s25 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s26 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a8, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a9, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a10, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s27 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s28 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s29 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a11, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a12, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a13, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s30 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s31 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a14, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a15, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a16, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s2 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a17, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a18, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a19, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s4 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s5 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s6 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a20, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a21, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a22, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s7 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s8 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s9 |
| ; GFX908-NEXT: v_mov_b32_e32 v3, s19 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a23, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a24, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a25, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s10 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s11 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s12 |
| ; GFX908-NEXT: v_mov_b32_e32 v5, s20 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a26, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a27, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a28, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s13 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s14 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s15 |
| ; GFX908-NEXT: v_mov_b32_e32 v3, 1.0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a4, v5 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a29, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a30, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a31, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 |
| ; GFX908-NEXT: s_endpgm |
| bb: |
| ; The loaded vector is both the MFMA accumulator input and, after the call, |
| ; the value written back to the same address. |
| %in.1 = load <32 x float>, ptr addrspace(1) %arg |
| %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) |
| store <32 x float> %mai.1, ptr addrspace(1) %arg |
| ret void |
| } |
| |
| ; Kernel: loads a <32 x float> from %arg, passes it as the accumulator operand |
| ; of llvm.amdgcn.mfma.f32.32x32x1f32 (constant operands 1.0 and 2.0, all three |
| ; trailing i32 arguments 0), and stores the result back to %arg. |
| ; The GFX908 checks below expect the accumulator to live in a[0:31]: it is |
| ; filled with v_accvgpr_write_b32 before the MFMA and drained with |
| ; v_accvgpr_read_b32 ahead of each global_store_dwordx4. |
| ; NOTE(review): the IR body is the same as in the _vgpr variant; the only |
| ; difference visible here is the attribute group (#2 here, #0 there) - confirm |
| ; against the attribute definitions what each group is meant to select. |
| define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) #2 { |
| ; GFX908-LABEL: test_mfma_f32_32x32x1f32_agpr: |
| ; GFX908: ; %bb.0: ; %bb |
| ; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX908-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX908-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX908-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 |
| ; GFX908-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 |
| ; GFX908-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s16 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s17 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s18 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s21 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s22 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s23 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a5, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a6, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a7, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s24 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s25 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s26 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a8, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a9, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a10, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s27 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s28 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s29 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a11, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a12, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a13, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s30 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s31 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a14, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a15, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a16, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s2 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a17, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a18, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a19, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s4 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s5 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s6 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a20, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a21, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a22, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s7 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s8 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s9 |
| ; GFX908-NEXT: v_mov_b32_e32 v3, s19 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a23, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a24, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a25, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s10 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s11 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s12 |
| ; GFX908-NEXT: v_mov_b32_e32 v5, s20 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a26, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a27, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a28, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, s13 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, s14 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s15 |
| ; GFX908-NEXT: v_mov_b32_e32 v3, 1.0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a4, v5 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a29, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a30, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a31, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 |
| ; GFX908-NEXT: s_endpgm |
| bb: |
| ; Load the accumulator, run one MFMA on it, and write the result back in place. |
| %in.1 = load <32 x float>, ptr addrspace(1) %arg |
| %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) |
| store <32 x float> %mai.1, ptr addrspace(1) %arg |
| ret void |
| } |
| |
| ; Kernel with no attribute group: an inline asm statement defines an i32 in |
| ; a0, then the same load / MFMA / store sequence as the other kernels in this |
| ; file runs on %arg. |
| ; The GFX908 checks below expect the asm block to be emitted first, the input |
| ; to be fetched with eight global_load_dwordx4, and the MFMA to accumulate in |
| ; a[0:31]. |
| ; NOTE(review): the function name says "virtual" AGPR, but the constraint |
| ; string "={a0}" names a specific register - confirm which form is intended. |
| define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr addrspace(1) %arg) { |
| ; GFX908-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr: |
| ; GFX908: ; %bb.0: ; %bb |
| ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX908-NEXT: v_mov_b32_e32 v32, 0 |
| ; GFX908-NEXT: ;;#ASMSTART |
| ; GFX908-NEXT: ; def a0 |
| ; GFX908-NEXT: ;;#ASMEND |
| ; GFX908-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 |
| ; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 |
| ; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 |
| ; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 |
| ; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 |
| ; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 |
| ; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 |
| ; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a5, v5 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a6, v6 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a7, v7 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a9, v9 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a10, v10 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a11, v11 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a13, v13 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a14, v14 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a15, v15 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a17, v17 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a18, v18 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a19, v19 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a21, v21 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a22, v22 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a23, v23 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a25, v25 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a26, v26 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a27, v27 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a29, v29 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a30, v30 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a31, v31 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 |
| ; GFX908-NEXT: s_endpgm |
| bb: |
| ; Side-effecting inline asm whose i32 result is constrained to a0; %acc is not |
| ; read by anything that follows in this function. |
| %acc = call i32 asm sideeffect "; def $0", "={a0}"() |
| %in.1 = load <32 x float>, ptr addrspace(1) %arg |
| %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) |
| store <32 x float> %mai.1, ptr addrspace(1) %arg |
| ret void |
| } |
| |
| ; Kernel with no attribute group: an inline asm statement consumes a poison |
| ; <32 x float> pinned to the register range a[100:131], then the same load / |
| ; MFMA / store sequence as the other kernels in this file runs on %arg. |
| ; The GFX908 checks below expect the asm block ("; use a[100:131]") to be |
| ; emitted first and the MFMA itself to accumulate in a[0:31]. |
| define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr addrspace(1) %arg) { |
| ; GFX908-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr: |
| ; GFX908: ; %bb.0: ; %bb |
| ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX908-NEXT: v_mov_b32_e32 v32, 0 |
| ; GFX908-NEXT: ;;#ASMSTART |
| ; GFX908-NEXT: ; use a[100:131] |
| ; GFX908-NEXT: ;;#ASMEND |
| ; GFX908-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 |
| ; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 |
| ; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 |
| ; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 |
| ; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 |
| ; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 |
| ; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 |
| ; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a5, v5 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a6, v6 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a7, v7 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a9, v9 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a10, v10 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a11, v11 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a13, v13 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a14, v14 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a15, v15 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a17, v17 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a18, v18 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a19, v19 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a21, v21 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a22, v22 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a23, v23 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a25, v25 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a26, v26 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a27, v27 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a29, v29 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a30, v30 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a31, v31 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 |
| ; GFX908-NEXT: s_endpgm |
| bb: |
| ; Side-effecting inline asm with one input operand: a poison vector constrained |
| ; to a[100:131]. It produces no value. |
| call void asm sideeffect "; use $0", "{a[100:131]}"(<32 x float> poison) |
| %in.1 = load <32 x float>, ptr addrspace(1) %arg |
| %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) |
| store <32 x float> %mai.1, ptr addrspace(1) %arg |
| ret void |
| } |
| |
| ; Kernel using attribute group #0: an inline asm statement defines an i32 in |
| ; v0 (so the asm itself names no "a" register), then the same load / MFMA / |
| ; store sequence as the other kernels in this file runs on %arg. |
| ; The GFX908 checks below expect the asm block ("; def v0") to be emitted |
| ; first and the MFMA to accumulate in a[0:31]. |
| ; NOTE(review): attribute group #0 is shared with the _vgpr kernel; see its |
| ; definition for what it is meant to select. |
| define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addrspace(1) %arg) #0 { |
| ; GFX908-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs: |
| ; GFX908: ; %bb.0: ; %bb |
| ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX908-NEXT: v_mov_b32_e32 v32, 0 |
| ; GFX908-NEXT: ;;#ASMSTART |
| ; GFX908-NEXT: ; def v0 |
| ; GFX908-NEXT: ;;#ASMEND |
| ; GFX908-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 |
| ; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 |
| ; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 |
| ; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 |
| ; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 |
| ; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 |
| ; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 |
| ; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a5, v5 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a6, v6 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a7, v7 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a9, v9 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a10, v10 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a11, v11 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a13, v13 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a14, v14 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a15, v15 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a17, v17 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a18, v18 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a19, v19 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a21, v21 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a22, v22 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a23, v23 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a25, v25 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a26, v26 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a27, v27 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a29, v29 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a30, v30 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a31, v31 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48 |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] |
| ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 |
| ; GFX908-NEXT: s_endpgm |
| bb: |
| ; Side-effecting inline asm whose i32 result is constrained to v0. The value |
| ; is named %acc but is not read by anything that follows in this function. |
| %acc = call i32 asm sideeffect "; def $0", "={v0}"() |
| %in.1 = load <32 x float>, ptr addrspace(1) %arg |
| %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) |
| store <32 x float> %mai.1, ptr addrspace(1) %arg |
| ret void |
| } |
| |
| define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) #1 { |
| ; GFX908-LABEL: test_mfma_f32_32x32x1f32_call: |
| ; GFX908: ; %bb.0: ; %bb |
| ; GFX908-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX908-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX908-NEXT: s_mov_b32 s38, -1 |
| ; GFX908-NEXT: s_mov_b32 s39, 0xe00000 |
| ; GFX908-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX908-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX908-NEXT: s_mov_b32 s12, s8 |
| ; GFX908-NEXT: s_add_u32 s8, s4, 44 |
| ; GFX908-NEXT: s_mov_b32 s13, s9 |
| ; GFX908-NEXT: s_addc_u32 s9, s5, 0 |
| ; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX908-NEXT: s_getpc_b64 s[4:5] |
| ; GFX908-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 |
| ; GFX908-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 |
| ; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 |
| ; GFX908-NEXT: s_mov_b32 s14, s10 |
| ; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] |
| ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 |
| ; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 |
| ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] |
| ; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] |
| ; GFX908-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 |
| ; GFX908-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX908-NEXT: s_mov_b32 s32, 0 |
| ; GFX908-NEXT: v_mov_b32_e32 v40, 0 |
| ; GFX908-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| ; GFX908-NEXT: global_load_dwordx4 v[28:31], v40, s[34:35] offset:112 |
| ; GFX908-NEXT: global_load_dwordx4 v[24:27], v40, s[34:35] offset:96 |
| ; GFX908-NEXT: global_load_dwordx4 v[20:23], v40, s[34:35] offset:80 |
| ; GFX908-NEXT: global_load_dwordx4 v[16:19], v40, s[34:35] offset:64 |
| ; GFX908-NEXT: global_load_dwordx4 v[12:15], v40, s[34:35] offset:48 |
| ; GFX908-NEXT: global_load_dwordx4 v[8:11], v40, s[34:35] offset:32 |
| ; GFX908-NEXT: global_load_dwordx4 v[4:7], v40, s[34:35] offset:16 |
| ; GFX908-NEXT: global_load_dwordx4 v[0:3], v40, s[34:35] |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a5, v5 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a6, v6 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a7, v7 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a9, v9 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a10, v10 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a11, v11 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a13, v13 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a14, v14 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a15, v15 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a17, v17 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a18, v18 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a19, v19 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a21, v21 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a22, v22 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a23, v23 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a25, v25 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a26, v26 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a27, v27 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a29, v29 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a30, v30 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a31, v31 |
| ; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 |
| ; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:96 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 |
| ; GFX908-NEXT: global_store_dwordx4 v40, v[4:7], s[34:35] offset:112 |
| ; GFX908-NEXT: global_store_dwordx4 v40, v[8:11], s[34:35] offset:64 |
| ; GFX908-NEXT: global_store_dwordx4 v40, v[12:15], s[34:35] offset:80 |
| ; GFX908-NEXT: global_store_dwordx4 v40, v[16:19], s[34:35] offset:32 |
| ; GFX908-NEXT: global_store_dwordx4 v40, v[20:23], s[34:35] offset:48 |
| ; GFX908-NEXT: global_store_dwordx4 v40, v[24:27], s[34:35] |
| ; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:16 |
| ; GFX908-NEXT: s_endpgm |
| bb: |
| call void @foo() |
| %in.1 = load <32 x float>, ptr addrspace(1) %arg |
| %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) |
| store <32 x float> %mai.1, ptr addrspace(1) %arg |
| ret void |
| } |
| |
| ; We could avoid the scan to find calls, since we see them during lowering, before selection. |
| ; However, in SelectionDAG, lowering and selection are done block by block, so that would only |
| ; work with GlobalISel. |
| ; Kernel where the MFMA sits in the entry block (bb1) and the call to @foo sits in bb2, which is |
| ; only entered when %c0 is true, so the call and the MFMA live in different basic blocks. |
| define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(1) %arg, i1 %c0) #1 { |
| ; GFX908-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb: |
| ; GFX908: ; %bb.0: ; %bb1 |
| ; GFX908-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 |
| ; GFX908-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 |
| ; GFX908-NEXT: s_mov_b32 s54, -1 |
| ; GFX908-NEXT: s_mov_b32 s55, 0xe00000 |
| ; GFX908-NEXT: s_add_u32 s52, s52, s11 |
| ; GFX908-NEXT: s_mov_b32 s14, s10 |
| ; GFX908-NEXT: s_mov_b32 s12, s8 |
| ; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] |
| ; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 |
| ; GFX908-NEXT: s_load_dword s8, s[4:5], 0x2c |
| ; GFX908-NEXT: v_mov_b32_e32 v6, 1.0 |
| ; GFX908-NEXT: v_mov_b32_e32 v7, 0 |
| ; GFX908-NEXT: s_addc_u32 s53, s53, 0 |
| ; GFX908-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX908-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0 |
| ; GFX908-NEXT: s_load_dwordx16 s[16:31], s[6:7], 0x40 |
| ; GFX908-NEXT: s_bitcmp0_b32 s8, 0 |
| ; GFX908-NEXT: s_mov_b32 s32, 0 |
| ; GFX908-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX908-NEXT: v_mov_b32_e32 v3, s36 |
| ; GFX908-NEXT: v_mov_b32_e32 v4, s37 |
| ; GFX908-NEXT: v_mov_b32_e32 v5, s40 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a0, v3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a1, v4 |
| ; GFX908-NEXT: v_mov_b32_e32 v3, s38 |
| ; GFX908-NEXT: v_mov_b32_e32 v4, s39 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a4, v5 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a2, v3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a3, v4 |
| ; GFX908-NEXT: v_mov_b32_e32 v3, s41 |
| ; GFX908-NEXT: v_mov_b32_e32 v4, s42 |
| ; GFX908-NEXT: v_mov_b32_e32 v5, s43 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a5, v3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a6, v4 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a7, v5 |
| ; GFX908-NEXT: v_mov_b32_e32 v3, s44 |
| ; GFX908-NEXT: v_mov_b32_e32 v4, s45 |
| ; GFX908-NEXT: v_mov_b32_e32 v5, s46 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a8, v3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a9, v4 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a10, v5 |
| ; GFX908-NEXT: v_mov_b32_e32 v3, s47 |
| ; GFX908-NEXT: v_mov_b32_e32 v4, s48 |
| ; GFX908-NEXT: v_mov_b32_e32 v5, s49 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a11, v3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a12, v4 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a13, v5 |
| ; GFX908-NEXT: v_mov_b32_e32 v3, s50 |
| ; GFX908-NEXT: v_mov_b32_e32 v4, s51 |
| ; GFX908-NEXT: v_mov_b32_e32 v5, s16 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a14, v3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a15, v4 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a16, v5 |
| ; GFX908-NEXT: v_mov_b32_e32 v3, s17 |
| ; GFX908-NEXT: v_mov_b32_e32 v4, s18 |
| ; GFX908-NEXT: v_mov_b32_e32 v5, s19 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a17, v3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a18, v4 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a19, v5 |
| ; GFX908-NEXT: v_mov_b32_e32 v3, s20 |
| ; GFX908-NEXT: v_mov_b32_e32 v4, s21 |
| ; GFX908-NEXT: v_mov_b32_e32 v5, s22 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a20, v3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a21, v4 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a22, v5 |
| ; GFX908-NEXT: v_mov_b32_e32 v3, s23 |
| ; GFX908-NEXT: v_mov_b32_e32 v4, s24 |
| ; GFX908-NEXT: v_mov_b32_e32 v5, s25 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a23, v3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a24, v4 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a25, v5 |
| ; GFX908-NEXT: v_mov_b32_e32 v3, s26 |
| ; GFX908-NEXT: v_mov_b32_e32 v4, s27 |
| ; GFX908-NEXT: v_mov_b32_e32 v5, s28 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a26, v3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a27, v4 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a28, v5 |
| ; GFX908-NEXT: v_mov_b32_e32 v3, s29 |
| ; GFX908-NEXT: v_mov_b32_e32 v4, s30 |
| ; GFX908-NEXT: v_mov_b32_e32 v5, s31 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a29, v3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a30, v4 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a31, v5 |
| ; GFX908-NEXT: v_mov_b32_e32 v3, 2.0 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v6, v3, a[0:31] cbsz:1 abid:2 blgp:3 |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v6, a27 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v5, a26 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v4, a25 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a24 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:96 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v6, a31 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v5, a30 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v4, a29 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a28 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:112 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v6, a19 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v5, a18 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v4, a17 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a16 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:64 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v6, a23 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v5, a22 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v4, a21 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a20 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:80 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v6, a11 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v5, a10 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v4, a9 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a8 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:32 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v6, a15 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v5, a14 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v4, a13 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a12 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:48 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v6, a7 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v5, a6 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v4, a5 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a4 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:16 |
| ; GFX908-NEXT: s_cbranch_scc1 .LBB6_2 |
| ; GFX908-NEXT: ; %bb.1: ; %bb2 |
| ; GFX908-NEXT: s_add_u32 s8, s4, 48 |
| ; GFX908-NEXT: s_mov_b32 s13, s9 |
| ; GFX908-NEXT: s_addc_u32 s9, s5, 0 |
| ; GFX908-NEXT: s_getpc_b64 s[4:5] |
| ; GFX908-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 |
| ; GFX908-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 |
| ; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 |
| ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 |
| ; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 |
| ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] |
| ; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] |
| ; GFX908-NEXT: s_mov_b64 s[0:1], s[52:53] |
| ; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 |
| ; GFX908-NEXT: s_mov_b64 s[2:3], s[54:55] |
| ; GFX908-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| ; GFX908-NEXT: .LBB6_2: ; %bb3 |
| ; GFX908-NEXT: s_endpgm |
| bb1: |
| ; Load the accumulator from %arg, run the MFMA with immediates 1, 2, 3 (printed as cbsz, abid and |
| ; blgp in the expected assembly) and store the result back in place. |
| %in.1 = load <32 x float>, ptr addrspace(1) %arg |
| %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3) |
| store <32 x float> %mai.1, ptr addrspace(1) %arg |
| ; Only take the block containing the call when %c0 is true; otherwise go straight to the return. |
| br i1 %c0, label %bb2, label %bb3 |
| |
| bb2: |
| call void @foo() |
| br label %bb3 |
| |
| bb3: |
| ret void |
| } |
| |
| ; Non-kernel function (plain `define void`, no amdgpu_kernel calling convention) that uses |
| ; attribute group #0, which sets "amdgpu-agpr-alloc"="0". It loads a <32 x float> accumulator |
| ; from %arg, runs the MFMA on it with constant operands 1.0 and 2.0, and stores the result back |
| ; to %arg. The gfx908 checks below expect the accumulator to be copied into a[0:31] before the |
| ; v_mfma and read back out of a[0:31] for the stores. |
| define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0 { |
| ; GFX908-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr: |
| ; GFX908: ; %bb.0: ; %bb |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112 |
| ; GFX908-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96 |
| ; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80 |
| ; GFX908-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64 |
| ; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48 |
| ; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 |
| ; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 |
| ; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a0, v2 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a1, v3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a2, v4 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a3, v5 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a4, v6 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a5, v7 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a6, v8 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a7, v9 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a8, v10 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a9, v11 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a10, v12 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a11, v13 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a12, v14 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a13, v15 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a14, v16 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a15, v17 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a16, v18 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a17, v19 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a18, v20 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a19, v21 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a20, v22 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a21, v23 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a22, v24 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a23, v25 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a24, v26 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a25, v27 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a26, v28 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a27, v29 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a28, v30 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a29, v31 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a30, v32 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a31, v33 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, 1.0 |
| ; GFX908-NEXT: v_mov_b32_e32 v3, 2.0 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v5, a27 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v4, a26 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a25 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a24 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v9, a31 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v8, a30 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v7, a29 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v6, a28 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v13, a19 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v12, a18 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v11, a17 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v10, a16 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v17, a23 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v16, a22 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v15, a21 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v14, a20 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v21, a11 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v20, a10 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v19, a9 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v18, a8 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v25, a15 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v24, a14 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v23, a13 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v22, a12 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v29, a3 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v28, a2 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v27, a1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v26, a0 |
| ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v5, a7 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v4, a6 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a5 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a4 |
| ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112 |
| ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64 |
| ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80 |
| ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:32 |
| ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:48 |
| ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[26:29], off |
| ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: s_setpc_b64 s[30:31] |
| bb: |
| ; The loaded vector is both the accumulator input and, after the MFMA, the value stored back. |
| %in.1 = load <32 x float>, ptr addrspace(1) %arg |
| %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) |
| store <32 x float> %mai.1, ptr addrspace(1) %arg |
| ret void |
| } |
| |
| ; Non-kernel function (plain `define void`) with the same IR body as |
| ; @test_mfma_f32_32x32x1f32_nonentry_noagpr, but using attribute group #3, which does not set |
| ; "amdgpu-agpr-alloc". It loads a <32 x float> accumulator from %arg, runs the MFMA on it with |
| ; constant operands 1.0 and 2.0, and stores the result back to %arg. The gfx908 checks below |
| ; expect the accumulator in a[0:31] around the v_mfma. |
| define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg) #3 { |
| ; GFX908-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr: |
| ; GFX908: ; %bb.0: ; %bb |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112 |
| ; GFX908-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96 |
| ; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80 |
| ; GFX908-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64 |
| ; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48 |
| ; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 |
| ; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 |
| ; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a0, v2 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a1, v3 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a2, v4 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a3, v5 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a4, v6 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a5, v7 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a6, v8 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a7, v9 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a8, v10 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a9, v11 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a10, v12 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a11, v13 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a12, v14 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a13, v15 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a14, v16 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a15, v17 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a16, v18 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a17, v19 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a18, v20 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a19, v21 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a20, v22 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a21, v23 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a22, v24 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a23, v25 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a24, v26 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a25, v27 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a26, v28 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a27, v29 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a28, v30 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a29, v31 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a30, v32 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a31, v33 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, 1.0 |
| ; GFX908-NEXT: v_mov_b32_e32 v3, 2.0 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v5, a27 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v4, a26 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a25 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a24 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v9, a31 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v8, a30 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v7, a29 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v6, a28 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v13, a19 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v12, a18 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v11, a17 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v10, a16 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v17, a23 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v16, a22 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v15, a21 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v14, a20 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v21, a11 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v20, a10 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v19, a9 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v18, a8 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v25, a15 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v24, a14 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v23, a13 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v22, a12 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v29, a3 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v28, a2 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v27, a1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v26, a0 |
| ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v5, a7 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v4, a6 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v3, a5 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v2, a4 |
| ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112 |
| ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64 |
| ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80 |
| ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:32 |
| ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:48 |
| ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[26:29], off |
| ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: s_setpc_b64 s[30:31] |
| bb: |
| ; The loaded vector is both the accumulator input and, after the MFMA, the value stored back. |
| %in.1 = load <32 x float>, ptr addrspace(1) %arg |
| %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) |
| store <32 x float> %mai.1, ptr addrspace(1) %arg |
| ret void |
| } |
| |
| ; External function used as the callee in the call tests; it is only declared, not defined. |
| declare void @foo() |
| |
| ; Attribute groups referenced by the `define` lines of the tests. |
| ; #0 is the only group here that combines "amdgpu-waves-per-eu"="2" with "amdgpu-agpr-alloc"="0". |
| attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" "amdgpu-agpr-alloc"="0" } |
| ; #1 has the same work-group size and waves-per-EU settings as #0 but no "amdgpu-agpr-alloc". |
| attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" } |
| ; #2 sets "amdgpu-agpr-alloc"="0" without "amdgpu-waves-per-eu". |
| ; NOTE(review): no user of #2 is visible in this part of the file - confirm it is still referenced. |
| attributes #2 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0" } |
| ; #3 is textually identical to #1. |
| ; NOTE(review): consider folding #3 into #1 if the duplication is not intentional. |
| attributes #3 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" } |
| ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: |
| ; GCN: {{.*}} |
| ; GFX90A: {{.*}} |