| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s |
| |
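| ; This testcase would fail on GFX908 due to not having a free VGPR available to |
| ; use as a temporary when copying between AGPRs (the GFX908 checks copy each |
| ; AGPR with a v_accvgpr_read_b32 / v_accvgpr_write_b32 pair through a VGPR). |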
| define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 { |
| ; GFX908-LABEL: no_free_vgprs_at_agpr_to_agpr_copy: |
| ; GFX908: ; %bb.0: |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX908-NEXT: v_mov_b32_e32 v32, v1 |
| ; GFX908-NEXT: v_mov_b32_e32 v33, v0 |
| ; GFX908-NEXT: ;;#ASMSTART |
| ; GFX908-NEXT: ; def v[0:31] a[0:15] |
| ; GFX908-NEXT: ;;#ASMEND |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a15 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a31, v34 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a14 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a30, v34 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a13 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a29, v34 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a12 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a28, v34 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a11 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a27, v34 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a10 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a26, v34 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a9 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a25, v34 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a8 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a24, v34 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a7 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a23, v34 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a6 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a22, v34 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a5 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a21, v34 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a4 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a20, v34 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a3 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a19, v34 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a2 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a18, v34 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a1 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a17, v34 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a0 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a16, v34 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31] |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a0 ; Reload Reuse |
| ; GFX908-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse |
| ; GFX908-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a1 ; Reload Reuse |
| ; GFX908-NEXT: v_accvgpr_read_b32 v36, a13 ; Reload Reuse |
| ; GFX908-NEXT: v_accvgpr_read_b32 v35, a14 ; Reload Reuse |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a2 ; Reload Reuse |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a3 ; Reload Reuse |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a4 ; Reload Reuse |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a5 ; Reload Reuse |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a6 ; Reload Reuse |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a7 ; Reload Reuse |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a8 ; Reload Reuse |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a9 ; Reload Reuse |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a10 ; Reload Reuse |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a15 ; Reload Reuse |
| ; GFX908-NEXT: ;;#ASMSTART |
| ; GFX908-NEXT: ; copy |
| ; GFX908-NEXT: ;;#ASMEND |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a1 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a16, v34 |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a0, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a1, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a2, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a3, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a4, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a5, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a6, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a7, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a8, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a9, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a10, v34 ; Reload Reuse |
| ; GFX908-NEXT: v_accvgpr_write_b32 a11, v38 ; Reload Reuse |
| ; GFX908-NEXT: v_accvgpr_write_b32 a12, v37 ; Reload Reuse |
| ; GFX908-NEXT: v_accvgpr_write_b32 a13, v36 ; Reload Reuse |
| ; GFX908-NEXT: v_accvgpr_write_b32 a14, v35 ; Reload Reuse |
| ; GFX908-NEXT: v_accvgpr_write_b32 a15, v34 ; Reload Reuse |
| ; GFX908-NEXT: ;;#ASMSTART |
| ; GFX908-NEXT: ; copy |
| ; GFX908-NEXT: ;;#ASMEND |
| ; GFX908-NEXT: v_accvgpr_read_b32 v32, a2 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a3, v32 |
| ; GFX908-NEXT: ;;#ASMSTART |
| ; GFX908-NEXT: ; use a3 v[0:31] |
| ; GFX908-NEXT: ;;#ASMEND |
| ; GFX908-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX90A-LABEL: no_free_vgprs_at_agpr_to_agpr_copy: |
| ; GFX90A: ; %bb.0: |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX90A-NEXT: v_mov_b32_e32 v33, v0 |
| ; GFX90A-NEXT: v_mov_b32_e32 v32, v1 |
| ; GFX90A-NEXT: ;;#ASMSTART |
| ; GFX90A-NEXT: ; def v[0:31] a[0:15] |
| ; GFX90A-NEXT: ;;#ASMEND |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a15 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a14 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a13 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a12 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a11 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a10 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a9 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a8 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a7 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a6 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a5 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a4 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a3 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a2 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a1 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0 |
| ; GFX90A-NEXT: s_nop 1 |
| ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31] |
| ; GFX90A-NEXT: s_nop 7 |
| ; GFX90A-NEXT: s_nop 2 |
| ; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) |
| ; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: buffer_store_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: buffer_store_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: buffer_store_dword a4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: buffer_store_dword a5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: buffer_store_dword a6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: buffer_store_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: buffer_store_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: buffer_store_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: v_accvgpr_read_b32 v39, a10 ; Reload Reuse |
| ; GFX90A-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse |
| ; GFX90A-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse |
| ; GFX90A-NEXT: v_accvgpr_read_b32 v36, a13 ; Reload Reuse |
| ; GFX90A-NEXT: v_accvgpr_read_b32 v35, a14 ; Reload Reuse |
| ; GFX90A-NEXT: v_accvgpr_read_b32 v34, a15 ; Reload Reuse |
| ; GFX90A-NEXT: ;;#ASMSTART |
| ; GFX90A-NEXT: ; copy |
| ; GFX90A-NEXT: ;;#ASMEND |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a1 |
| ; GFX90A-NEXT: buffer_load_dword a0, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: buffer_load_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: buffer_load_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: buffer_load_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: buffer_load_dword a4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: buffer_load_dword a5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: buffer_load_dword a6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: buffer_load_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: buffer_load_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: buffer_load_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a10, v39 ; Reload Reuse |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a11, v38 ; Reload Reuse |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a12, v37 ; Reload Reuse |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a13, v36 ; Reload Reuse |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a14, v35 ; Reload Reuse |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a15, v34 ; Reload Reuse |
| ; GFX90A-NEXT: ;;#ASMSTART |
| ; GFX90A-NEXT: ; copy |
| ; GFX90A-NEXT: ;;#ASMEND |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a2 |
| ; GFX90A-NEXT: ;;#ASMSTART |
| ; GFX90A-NEXT: ; use a3 v[0:31] |
| ; GFX90A-NEXT: ;;#ASMEND |
| ; GFX90A-NEXT: s_setpc_b64 s[30:31] |
| ; Test body: the first asm defines a <32 x i32> and a <16 x float> value; the |
| ; checks above show them assigned to v[0:31] and a[0:15], so v0-v31 stay |
| ; occupied until the final "use" asm consumes %vgpr0. |
| %asm = call { <32 x i32>, <16 x float> } asm sideeffect "; def $0 $1","=${v[0:31]},=${a[0:15]}"() |
| %vgpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 0 |
| %agpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 1 |
| ; The MFMA takes %agpr0 as its vector operand; %agpr0 is also read by the |
| ; first "copy" asm below, so it has a second user besides the MFMA. |
| %mfma = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %v0, float %v1, <16 x float> %agpr0, i32 0, i32 0, i32 0) |
| ; Output pinned to a1; %agpr0 is passed with the generic "a" constraint while |
| ; a[0:15] is listed as clobbered. |
| %agpr1 = call i32 asm sideeffect "; copy ", "={a1},a,~{a[0:15]}"(<16 x float> %agpr0) |
| ; Output pinned to a2; %agpr1 uses the generic "a" constraint and %mfma is |
| ; pinned to a[0:15]. |
| %agpr2 = call i32 asm sideeffect "; copy ", "={a2},a,{a[0:15]}"(i32 %agpr1, <16 x float> %mfma) |
| ; Final use pins %agpr2 to a3 and %vgpr0 to v[0:31]. |
| call void asm sideeffect "; use $0 $1","{a3},{v[0:31]}"(i32 %agpr2, <32 x i32> %vgpr0) |
| ret void |
| } |
| |
| ; Check that we do make use of v32 if there are no AGPRs present in the |
| ; function (both check prefixes below reference v32). |
| define amdgpu_kernel void @no_agpr_no_reserve(<32 x i32> addrspace(1)* %arg) #0 { |
| ; GFX908-LABEL: no_agpr_no_reserve: |
| ; GFX908: ; %bb.0: |
| ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 |
| ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 7, v0 |
| ; GFX908-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX908-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 |
| ; GFX908-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] |
| ; GFX908-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 |
| ; GFX908-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 |
| ; GFX908-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 |
| ; GFX908-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 |
| ; GFX908-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 |
| ; GFX908-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 |
| ; GFX908-NEXT: s_waitcnt vmcnt(7) |
| ; GFX908-NEXT: v_add_u32_e32 v4, v4, v4 |
| ; GFX908-NEXT: v_add_u32_e32 v3, v3, v3 |
| ; GFX908-NEXT: v_add_u32_e32 v2, v2, v2 |
| ; GFX908-NEXT: v_add_u32_e32 v1, v1, v1 |
| ; GFX908-NEXT: s_waitcnt vmcnt(6) |
| ; GFX908-NEXT: v_add_u32_e32 v8, v8, v8 |
| ; GFX908-NEXT: v_add_u32_e32 v7, v7, v7 |
| ; GFX908-NEXT: v_add_u32_e32 v6, v6, v6 |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_add_u32_e32 v32, v32, v32 |
| ; GFX908-NEXT: v_add_u32_e32 v31, v31, v31 |
| ; GFX908-NEXT: v_add_u32_e32 v30, v30, v30 |
| ; GFX908-NEXT: v_add_u32_e32 v29, v29, v29 |
| ; GFX908-NEXT: v_add_u32_e32 v5, v5, v5 |
| ; GFX908-NEXT: v_add_u32_e32 v12, v12, v12 |
| ; GFX908-NEXT: v_add_u32_e32 v11, v11, v11 |
| ; GFX908-NEXT: v_add_u32_e32 v10, v10, v10 |
| ; GFX908-NEXT: v_add_u32_e32 v9, v9, v9 |
| ; GFX908-NEXT: v_add_u32_e32 v16, v16, v16 |
| ; GFX908-NEXT: v_add_u32_e32 v15, v15, v15 |
| ; GFX908-NEXT: v_add_u32_e32 v14, v14, v14 |
| ; GFX908-NEXT: v_add_u32_e32 v13, v13, v13 |
| ; GFX908-NEXT: v_add_u32_e32 v20, v20, v20 |
| ; GFX908-NEXT: v_add_u32_e32 v19, v19, v19 |
| ; GFX908-NEXT: v_add_u32_e32 v18, v18, v18 |
| ; GFX908-NEXT: v_add_u32_e32 v17, v17, v17 |
| ; GFX908-NEXT: v_add_u32_e32 v24, v24, v24 |
| ; GFX908-NEXT: v_add_u32_e32 v23, v23, v23 |
| ; GFX908-NEXT: v_add_u32_e32 v22, v22, v22 |
| ; GFX908-NEXT: v_add_u32_e32 v21, v21, v21 |
| ; GFX908-NEXT: v_add_u32_e32 v28, v28, v28 |
| ; GFX908-NEXT: v_add_u32_e32 v27, v27, v27 |
| ; GFX908-NEXT: v_add_u32_e32 v26, v26, v26 |
| ; GFX908-NEXT: v_add_u32_e32 v25, v25, v25 |
| ; GFX908-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96 |
| ; GFX908-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112 |
| ; GFX908-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64 |
| ; GFX908-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80 |
| ; GFX908-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32 |
| ; GFX908-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48 |
| ; GFX908-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] |
| ; GFX908-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16 |
| ; GFX908-NEXT: s_endpgm |
| ; |
| ; GFX90A-LABEL: no_agpr_no_reserve: |
| ; GFX90A: ; %bb.0: |
| ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 |
| ; GFX90A-NEXT: v_lshlrev_b32_e32 v32, 7, v0 |
| ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX90A-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 |
| ; GFX90A-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] |
| ; GFX90A-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48 |
| ; GFX90A-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:32 |
| ; GFX90A-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80 |
| ; GFX90A-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 |
| ; GFX90A-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 |
| ; GFX90A-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 |
| ; GFX90A-NEXT: s_waitcnt vmcnt(7) |
| ; GFX90A-NEXT: v_add_u32_e32 v3, v3, v3 |
| ; GFX90A-NEXT: v_add_u32_e32 v2, v2, v2 |
| ; GFX90A-NEXT: v_add_u32_e32 v1, v1, v1 |
| ; GFX90A-NEXT: v_add_u32_e32 v0, v0, v0 |
| ; GFX90A-NEXT: s_waitcnt vmcnt(6) |
| ; GFX90A-NEXT: v_add_u32_e32 v7, v7, v7 |
| ; GFX90A-NEXT: v_add_u32_e32 v6, v6, v6 |
| ; GFX90A-NEXT: v_add_u32_e32 v5, v5, v5 |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) |
| ; GFX90A-NEXT: v_add_u32_e32 v31, v31, v31 |
| ; GFX90A-NEXT: v_add_u32_e32 v30, v30, v30 |
| ; GFX90A-NEXT: v_add_u32_e32 v29, v29, v29 |
| ; GFX90A-NEXT: v_add_u32_e32 v28, v28, v28 |
| ; GFX90A-NEXT: v_add_u32_e32 v4, v4, v4 |
| ; GFX90A-NEXT: v_add_u32_e32 v11, v11, v11 |
| ; GFX90A-NEXT: v_add_u32_e32 v10, v10, v10 |
| ; GFX90A-NEXT: v_add_u32_e32 v9, v9, v9 |
| ; GFX90A-NEXT: v_add_u32_e32 v8, v8, v8 |
| ; GFX90A-NEXT: v_add_u32_e32 v15, v15, v15 |
| ; GFX90A-NEXT: v_add_u32_e32 v14, v14, v14 |
| ; GFX90A-NEXT: v_add_u32_e32 v13, v13, v13 |
| ; GFX90A-NEXT: v_add_u32_e32 v12, v12, v12 |
| ; GFX90A-NEXT: v_add_u32_e32 v19, v19, v19 |
| ; GFX90A-NEXT: v_add_u32_e32 v18, v18, v18 |
| ; GFX90A-NEXT: v_add_u32_e32 v17, v17, v17 |
| ; GFX90A-NEXT: v_add_u32_e32 v16, v16, v16 |
| ; GFX90A-NEXT: v_add_u32_e32 v23, v23, v23 |
| ; GFX90A-NEXT: v_add_u32_e32 v22, v22, v22 |
| ; GFX90A-NEXT: v_add_u32_e32 v21, v21, v21 |
| ; GFX90A-NEXT: v_add_u32_e32 v20, v20, v20 |
| ; GFX90A-NEXT: v_add_u32_e32 v27, v27, v27 |
| ; GFX90A-NEXT: v_add_u32_e32 v26, v26, v26 |
| ; GFX90A-NEXT: v_add_u32_e32 v25, v25, v25 |
| ; GFX90A-NEXT: v_add_u32_e32 v24, v24, v24 |
| ; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96 |
| ; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112 |
| ; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:64 |
| ; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:80 |
| ; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:32 |
| ; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48 |
| ; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] |
| ; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 |
| ; GFX90A-NEXT: s_endpgm |
| ; Test body: load the <32 x i32> element of %arg indexed by the workitem id, |
| ; add it to itself, and store the result back to the same address. The value |
| ; plus its address needs 33 VGPRs, which is why v32 appears in the checks. |
| %id = call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <32 x i32>, <32 x i32> addrspace(1)* %arg, i32 %id |
| %load = load <32 x i32>, <32 x i32> addrspace(1)* %gep |
| %add = add <32 x i32> %load, %load |
| store <32 x i32> %add, <32 x i32> addrspace(1)* %gep |
| ret void |
| } |
| |
| ; FIXME: This case is broken. The asm value passed in v32 is live |
| ; through the range where the reserved def for the copy is introduced, |
| ; clobbering the user value. |
| define void @v32_asm_def_use(float %v0, float %v1) #0 { |
| ; GFX908-LABEL: v32_asm_def_use: |
| ; GFX908: ; %bb.0: |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX908-NEXT: v_mov_b32_e32 v33, v1 |
| ; GFX908-NEXT: v_mov_b32_e32 v34, v0 |
| ; GFX908-NEXT: ;;#ASMSTART |
| ; GFX908-NEXT: ; def v[0:31] a[0:15] |
| ; GFX908-NEXT: ;;#ASMEND |
| ; GFX908-NEXT: v_accvgpr_read_b32 v35, a15 |
| ; GFX908-NEXT: ;;#ASMSTART |
| ; GFX908-NEXT: ; def v32 |
| ; GFX908-NEXT: ;;#ASMEND |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a31, v35 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v35, a14 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a30, v35 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v35, a13 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a29, v35 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v35, a12 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a28, v35 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v35, a11 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a27, v35 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v35, a10 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a26, v35 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v35, a9 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a25, v35 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v35, a8 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a24, v35 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v35, a7 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a23, v35 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v35, a6 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a22, v35 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v35, a5 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a21, v35 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v35, a4 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a20, v35 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v35, a3 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a19, v35 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v35, a2 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a18, v35 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v35, a1 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a17, v35 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v35, a0 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a16, v35 |
| ; GFX908-NEXT: ;;#ASMSTART |
| ; GFX908-NEXT: ; copy |
| ; GFX908-NEXT: ;;#ASMEND |
| ; GFX908-NEXT: v_accvgpr_read_b32 v35, a1 |
| ; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v34, v33, a[16:31] |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a32, v35 |
| ; GFX908-NEXT: ;;#ASMSTART |
| ; GFX908-NEXT: ; copy |
| ; GFX908-NEXT: ;;#ASMEND |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v33, a2 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a3, v33 |
| ; GFX908-NEXT: ;;#ASMSTART |
| ; GFX908-NEXT: ; use a3 v[0:31] |
| ; GFX908-NEXT: ;;#ASMEND |
| ; GFX908-NEXT: ;;#ASMSTART |
| ; GFX908-NEXT: ; use v32 |
| ; GFX908-NEXT: ;;#ASMEND |
| ; GFX908-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX90A-LABEL: v32_asm_def_use: |
| ; GFX90A: ; %bb.0: |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX90A-NEXT: v_mov_b32_e32 v34, v0 |
| ; GFX90A-NEXT: v_mov_b32_e32 v33, v1 |
| ; GFX90A-NEXT: ;;#ASMSTART |
| ; GFX90A-NEXT: ; def v[0:31] a[0:15] |
| ; GFX90A-NEXT: ;;#ASMEND |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a15 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a14 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a13 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a12 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a11 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a10 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a9 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a8 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a7 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a6 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a5 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a4 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a3 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a2 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a1 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0 |
| ; GFX90A-NEXT: ;;#ASMSTART |
| ; GFX90A-NEXT: ; def v32 |
| ; GFX90A-NEXT: ;;#ASMEND |
| ; GFX90A-NEXT: ;;#ASMSTART |
| ; GFX90A-NEXT: ; copy |
| ; GFX90A-NEXT: ;;#ASMEND |
| ; GFX90A-NEXT: v_accvgpr_read_b32 v35, a32 ; Reload Reuse |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a32, a1 |
| ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v34, v33, a[16:31] |
| ; GFX90A-NEXT: ;;#ASMSTART |
| ; GFX90A-NEXT: ; copy |
| ; GFX90A-NEXT: ;;#ASMEND |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a32, v35 ; Reload Reuse |
| ; GFX90A-NEXT: s_nop 7 |
| ; GFX90A-NEXT: s_nop 1 |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a2 |
| ; GFX90A-NEXT: ;;#ASMSTART |
| ; GFX90A-NEXT: ; use a3 v[0:31] |
| ; GFX90A-NEXT: ;;#ASMEND |
| ; GFX90A-NEXT: ;;#ASMSTART |
| ; GFX90A-NEXT: ; use v32 |
| ; GFX90A-NEXT: ;;#ASMEND |
| ; GFX90A-NEXT: s_setpc_b64 s[30:31] |
| ; Test body: same def / MFMA / copy / use sequence as |
| ; @no_free_vgprs_at_agpr_to_agpr_copy, plus an asm-defined value in v32 that |
| ; stays live across the copies until the final "use v32" asm. |
| %asm = call { <32 x i32>, <16 x float> } asm sideeffect "; def $0 $1","=${v[0:31]},=${a[0:15]}"() |
| %vgpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 0 |
| %agpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 1 |
| %mfma = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %v0, float %v1, <16 x float> %agpr0, i32 0, i32 0, i32 0) |
| ; User value whose only use is the last asm statement in this function. |
| %v32 = call i32 asm sideeffect "; def $0","=${v32}"() |
| ; Output pinned to a1; %agpr0 is passed with the generic "a" constraint while |
| ; a[0:15] is listed as clobbered. |
| %agpr1 = call i32 asm sideeffect "; copy ", "={a1},a,~{a[0:15]}"(<16 x float> %agpr0) |
| ; Output pinned to a2; %agpr1 uses the generic "a" constraint and %mfma is |
| ; pinned to a[0:15]. |
| %agpr2 = call i32 asm sideeffect "; copy ", "={a2},a,{a[0:15]}"(i32 %agpr1, <16 x float> %mfma) |
| ; Final uses: %agpr2 in a3 with %vgpr0 in v[0:31], then %v32. |
| call void asm sideeffect "; use $0 $1","{a3},{v[0:31]}"(i32 %agpr2, <32 x i32> %vgpr0) |
| call void asm sideeffect "; use $0","${v32}"(i32 %v32) |
| ret void |
| } |
| |
| define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg2, i64 %arg3, <2 x half> %arg4, <2 x half> %arg5) #3 { |
| ; GFX908-LABEL: introduced_copy_to_sgpr: |
| ; GFX908: ; %bb.0: ; %bb |
| ; GFX908-NEXT: global_load_ushort v24, v[0:1], off glc |
| ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 |
| ; GFX908-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x10 |
| ; GFX908-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX908-NEXT: s_load_dword s5, s[4:5], 0x18 |
| ; GFX908-NEXT: s_mov_b32 s4, 0 |
| ; GFX908-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s3 |
| ; GFX908-NEXT: s_sub_i32 s6, 0, s3 |
| ; GFX908-NEXT: s_lshl_b64 s[8:9], s[10:11], 5 |
| ; GFX908-NEXT: s_lshr_b32 s12, s5, 16 |
| ; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0 |
| ; GFX908-NEXT: v_cvt_f32_f16_e32 v25, s5 |
| ; GFX908-NEXT: v_cvt_f32_f16_e32 v26, s12 |
| ; GFX908-NEXT: s_or_b32 s8, s8, 28 |
| ; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 |
| ; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0 |
| ; GFX908-NEXT: v_mov_b32_e32 v6, s10 |
| ; GFX908-NEXT: v_mov_b32_e32 v7, s11 |
| ; GFX908-NEXT: v_mul_lo_u32 v2, s6, v0 |
| ; GFX908-NEXT: s_lshl_b64 s[6:7], s[0:1], 5 |
| ; GFX908-NEXT: v_mul_hi_u32 v2, v0, v2 |
| ; GFX908-NEXT: v_add_u32_e32 v0, v0, v2 |
| ; GFX908-NEXT: v_mul_hi_u32 v0, s2, v0 |
| ; GFX908-NEXT: v_mov_b32_e32 v2, s8 |
| ; GFX908-NEXT: v_mov_b32_e32 v3, s9 |
| ; GFX908-NEXT: v_mul_lo_u32 v4, v0, s3 |
| ; GFX908-NEXT: v_add_u32_e32 v5, 1, v0 |
| ; GFX908-NEXT: v_sub_u32_e32 v4, s2, v4 |
| ; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s3, v4 |
| ; GFX908-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc |
| ; GFX908-NEXT: v_subrev_u32_e32 v5, s3, v4 |
| ; GFX908-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc |
| ; GFX908-NEXT: v_add_u32_e32 v5, 1, v0 |
| ; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s3, v4 |
| ; GFX908-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc |
| ; GFX908-NEXT: v_lshlrev_b64 v[4:5], 5, v[0:1] |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_readfirstlane_b32 s2, v24 |
| ; GFX908-NEXT: s_and_b32 s2, 0xffff, s2 |
| ; GFX908-NEXT: s_mul_i32 s1, s1, s2 |
| ; GFX908-NEXT: s_mul_hi_u32 s3, s0, s2 |
| ; GFX908-NEXT: s_mul_i32 s0, s0, s2 |
| ; GFX908-NEXT: s_add_i32 s1, s3, s1 |
| ; GFX908-NEXT: s_lshl_b64 s[8:9], s[0:1], 5 |
| ; GFX908-NEXT: s_branch .LBB3_2 |
| ; GFX908-NEXT: .LBB3_1: ; %bb12 |
| ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 |
| ; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 |
| ; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc |
| ; GFX908-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 |
| ; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc |
| ; GFX908-NEXT: .LBB3_2: ; %bb9 |
| ; GFX908-NEXT: ; =>This Loop Header: Depth=1 |
| ; GFX908-NEXT: ; Child Loop BB3_5 Depth 2 |
| ; GFX908-NEXT: s_cbranch_scc0 .LBB3_1 |
| ; GFX908-NEXT: ; %bb.3: ; %bb14 |
| ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 |
| ; GFX908-NEXT: v_mov_b32_e32 v8, 0 |
| ; GFX908-NEXT: v_mov_b32_e32 v9, 0 |
| ; GFX908-NEXT: global_load_dwordx2 v[8:9], v[8:9], off |
| ; GFX908-NEXT: s_mov_b32 s5, s4 |
| ; GFX908-NEXT: v_mov_b32_e32 v13, s5 |
| ; GFX908-NEXT: v_mov_b32_e32 v15, s5 |
| ; GFX908-NEXT: v_mov_b32_e32 v17, s5 |
| ; GFX908-NEXT: v_mov_b32_e32 v12, s4 |
| ; GFX908-NEXT: v_mov_b32_e32 v14, s4 |
| ; GFX908-NEXT: v_mov_b32_e32 v16, s4 |
| ; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[6:7] |
| ; GFX908-NEXT: v_mov_b32_e32 v11, v3 |
| ; GFX908-NEXT: v_mov_b32_e32 v19, v13 |
| ; GFX908-NEXT: v_mov_b32_e32 v10, v2 |
| ; GFX908-NEXT: v_mov_b32_e32 v18, v12 |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_readfirstlane_b32 s2, v8 |
| ; GFX908-NEXT: v_readfirstlane_b32 s3, v9 |
| ; GFX908-NEXT: s_add_u32 s2, s2, 1 |
| ; GFX908-NEXT: s_addc_u32 s3, s3, 0 |
| ; GFX908-NEXT: s_mul_hi_u32 s5, s6, s2 |
| ; GFX908-NEXT: s_mul_i32 s11, s7, s2 |
| ; GFX908-NEXT: s_mul_i32 s10, s6, s2 |
| ; GFX908-NEXT: s_mul_i32 s2, s6, s3 |
| ; GFX908-NEXT: s_add_i32 s2, s5, s2 |
| ; GFX908-NEXT: s_add_i32 s5, s2, s11 |
| ; GFX908-NEXT: s_branch .LBB3_5 |
| ; GFX908-NEXT: .LBB3_4: ; %bb58 |
| ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 |
| ; GFX908-NEXT: v_add_co_u32_sdwa v8, vcc, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 |
| ; GFX908-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc |
| ; GFX908-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[8:9] |
| ; GFX908-NEXT: v_mov_b32_e32 v20, s9 |
| ; GFX908-NEXT: v_add_co_u32_e64 v10, s[2:3], s8, v10 |
| ; GFX908-NEXT: v_addc_co_u32_e64 v11, s[2:3], v11, v20, s[2:3] |
| ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 |
| ; GFX908-NEXT: .LBB3_5: ; %bb16 |
| ; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1 |
| ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 |
| ; GFX908-NEXT: v_mov_b32_e32 v21, s5 |
| ; GFX908-NEXT: v_add_co_u32_e32 v20, vcc, s10, v10 |
| ; GFX908-NEXT: v_addc_co_u32_e32 v21, vcc, v11, v21, vcc |
| ; GFX908-NEXT: global_load_dword v28, v[20:21], off offset:-12 glc |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: global_load_dword v27, v[20:21], off offset:-8 glc |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: global_load_dword v22, v[20:21], off offset:-4 glc |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: global_load_dword v20, v[20:21], off glc |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: ds_read_b64 v[20:21], v1 |
| ; GFX908-NEXT: ds_read_b64 v[22:23], v0 |
| ; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] |
| ; GFX908-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX908-NEXT: s_cbranch_vccnz .LBB3_4 |
| ; GFX908-NEXT: ; %bb.6: ; %bb51 |
| ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 |
| ; GFX908-NEXT: v_cvt_f32_f16_sdwa v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 |
| ; GFX908-NEXT: v_cvt_f32_f16_e32 v28, v28 |
| ; GFX908-NEXT: v_cvt_f32_f16_sdwa v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 |
| ; GFX908-NEXT: v_cvt_f32_f16_e32 v27, v27 |
| ; GFX908-NEXT: v_add_f32_e32 v31, v25, v20 |
| ; GFX908-NEXT: v_add_f32_e32 v32, v26, v21 |
| ; GFX908-NEXT: v_add_f32_e32 v33, 0, v20 |
| ; GFX908-NEXT: v_add_f32_e32 v34, 0, v21 |
| ; GFX908-NEXT: v_add_f32_e32 v23, v29, v23 |
| ; GFX908-NEXT: v_add_f32_e32 v22, v28, v22 |
| ; GFX908-NEXT: v_add_f32_e32 v21, v30, v21 |
| ; GFX908-NEXT: v_add_f32_e32 v20, v27, v20 |
| ; GFX908-NEXT: v_add_f32_e32 v13, v13, v32 |
| ; GFX908-NEXT: v_add_f32_e32 v12, v12, v31 |
| ; GFX908-NEXT: v_add_f32_e32 v15, v15, v34 |
| ; GFX908-NEXT: v_add_f32_e32 v14, v14, v33 |
| ; GFX908-NEXT: v_add_f32_e32 v16, v16, v22 |
| ; GFX908-NEXT: v_add_f32_e32 v17, v17, v23 |
| ; GFX908-NEXT: v_add_f32_e32 v18, v18, v20 |
| ; GFX908-NEXT: v_add_f32_e32 v19, v19, v21 |
| ; GFX908-NEXT: s_branch .LBB3_4 |
| ; |
| ; GFX90A-LABEL: introduced_copy_to_sgpr: |
| ; GFX90A: ; %bb.0: ; %bb |
| ; GFX90A-NEXT: global_load_ushort v28, v[0:1], off glc |
| ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 |
| ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x10 |
| ; GFX90A-NEXT: s_load_dword s7, s[4:5], 0x18 |
| ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX90A-NEXT: s_mov_b32 s6, 0 |
| ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 |
| ; GFX90A-NEXT: s_sub_i32 s12, 0, s3 |
| ; GFX90A-NEXT: s_lshr_b32 s13, s7, 16 |
| ; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s7 |
| ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 |
| ; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s13 |
| ; GFX90A-NEXT: s_lshl_b64 s[4:5], s[0:1], 5 |
| ; GFX90A-NEXT: s_lshl_b64 s[10:11], s[8:9], 5 |
| ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 |
| ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 |
| ; GFX90A-NEXT: s_or_b32 s10, s10, 28 |
| ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[8:9], s[8:9] op_sel:[0,1] |
| ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[10:11], s[10:11] op_sel:[0,1] |
| ; GFX90A-NEXT: v_mul_lo_u32 v8, s12, v0 |
| ; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 |
| ; GFX90A-NEXT: v_add_u32_e32 v0, v0, v8 |
| ; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 |
| ; GFX90A-NEXT: v_mul_lo_u32 v8, v0, s3 |
| ; GFX90A-NEXT: v_sub_u32_e32 v8, s2, v8 |
| ; GFX90A-NEXT: v_add_u32_e32 v9, 1, v0 |
| ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v8 |
| ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc |
| ; GFX90A-NEXT: v_subrev_u32_e32 v9, s3, v8 |
| ; GFX90A-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc |
| ; GFX90A-NEXT: v_add_u32_e32 v9, 1, v0 |
| ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v8 |
| ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc |
| ; GFX90A-NEXT: v_lshlrev_b64 v[8:9], 5, v[0:1] |
| ; GFX90A-NEXT: v_pk_mov_b32 v[10:11], 0, 0 |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) |
| ; GFX90A-NEXT: v_readfirstlane_b32 s2, v28 |
| ; GFX90A-NEXT: s_and_b32 s2, 0xffff, s2 |
| ; GFX90A-NEXT: s_mul_i32 s1, s1, s2 |
| ; GFX90A-NEXT: s_mul_hi_u32 s3, s0, s2 |
| ; GFX90A-NEXT: s_mul_i32 s0, s0, s2 |
| ; GFX90A-NEXT: s_add_i32 s1, s3, s1 |
| ; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 |
| ; GFX90A-NEXT: s_branch .LBB3_2 |
| ; GFX90A-NEXT: .LBB3_1: ; %bb12 |
| ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 |
| ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v4, v0 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc |
| ; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, v6, v8 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v9, vcc |
| ; GFX90A-NEXT: .LBB3_2: ; %bb9 |
| ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 |
| ; GFX90A-NEXT: ; Child Loop BB3_5 Depth 2 |
| ; GFX90A-NEXT: s_cbranch_scc0 .LBB3_1 |
| ; GFX90A-NEXT: ; %bb.3: ; %bb14 |
| ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 |
| ; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[10:11], off |
| ; GFX90A-NEXT: s_mov_b32 s7, s6 |
| ; GFX90A-NEXT: v_pk_mov_b32 v[16:17], s[6:7], s[6:7] op_sel:[0,1] |
| ; GFX90A-NEXT: v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1] |
| ; GFX90A-NEXT: v_pk_mov_b32 v[20:21], s[6:7], s[6:7] op_sel:[0,1] |
| ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[4:5] |
| ; GFX90A-NEXT: v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1] |
| ; GFX90A-NEXT: v_pk_mov_b32 v[22:23], v[16:17], v[16:17] op_sel:[0,1] |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) |
| ; GFX90A-NEXT: v_readfirstlane_b32 s7, v12 |
| ; GFX90A-NEXT: v_readfirstlane_b32 s8, v13 |
| ; GFX90A-NEXT: s_add_u32 s7, s7, 1 |
| ; GFX90A-NEXT: s_addc_u32 s9, s8, 0 |
| ; GFX90A-NEXT: s_mul_hi_u32 s10, s4, s7 |
| ; GFX90A-NEXT: s_mul_i32 s11, s5, s7 |
| ; GFX90A-NEXT: s_mul_i32 s8, s4, s7 |
| ; GFX90A-NEXT: s_mul_i32 s7, s4, s9 |
| ; GFX90A-NEXT: s_add_i32 s7, s10, s7 |
| ; GFX90A-NEXT: s_add_i32 s7, s7, s11 |
| ; GFX90A-NEXT: s_branch .LBB3_5 |
| ; GFX90A-NEXT: .LBB3_4: ; %bb58 |
| ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 |
| ; GFX90A-NEXT: v_add_co_u32_sdwa v12, vcc, v12, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc |
| ; GFX90A-NEXT: v_mov_b32_e32 v24, s3 |
| ; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, s2, v14 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v24, vcc |
| ; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[12:13] |
| ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 |
| ; GFX90A-NEXT: .LBB3_5: ; %bb16 |
| ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 |
| ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 |
| ; GFX90A-NEXT: v_mov_b32_e32 v25, s7 |
| ; GFX90A-NEXT: v_add_co_u32_e32 v24, vcc, s8, v14 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v25, vcc, v15, v25, vcc |
| ; GFX90A-NEXT: global_load_dword v30, v[24:25], off offset:-12 glc |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) |
| ; GFX90A-NEXT: global_load_dword v29, v[24:25], off offset:-8 glc |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) |
| ; GFX90A-NEXT: global_load_dword v26, v[24:25], off offset:-4 glc |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) |
| ; GFX90A-NEXT: global_load_dword v26, v[24:25], off glc |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) |
| ; GFX90A-NEXT: ; kill: killed $vgpr24 killed $vgpr25 |
| ; GFX90A-NEXT: ds_read_b64 v[24:25], v1 |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) |
| ; GFX90A-NEXT: ds_read_b64 v[26:27], v0 |
| ; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1] |
| ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_4 |
| ; GFX90A-NEXT: ; %bb.6: ; %bb51 |
| ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 |
| ; GFX90A-NEXT: v_cvt_f32_f16_sdwa v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 |
| ; GFX90A-NEXT: v_cvt_f32_f16_e32 v30, v30 |
| ; GFX90A-NEXT: v_cvt_f32_f16_sdwa v33, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 |
| ; GFX90A-NEXT: v_cvt_f32_f16_e32 v32, v29 |
| ; GFX90A-NEXT: v_pk_add_f32 v[34:35], v[2:3], v[24:25] |
| ; GFX90A-NEXT: v_pk_add_f32 v[36:37], v[24:25], 0 op_sel_hi:[1,0] |
| ; GFX90A-NEXT: v_pk_add_f32 v[26:27], v[30:31], v[26:27] |
| ; GFX90A-NEXT: v_pk_add_f32 v[24:25], v[32:33], v[24:25] |
| ; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[34:35] |
| ; GFX90A-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[36:37] |
| ; GFX90A-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[26:27] |
| ; GFX90A-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[24:25] |
| ; GFX90A-NEXT: s_branch .LBB3_4 |
| bb: |
| %i = load volatile i16, i16 addrspace(4)* undef, align 2 |
| %i6 = zext i16 %i to i64 |
| %i7 = udiv i32 %arg1, %arg2 |
| %i8 = zext i32 %i7 to i64 |
| br label %bb9 |
| |
| bb9: ; preds = %bb12, %bb |
| %i10 = phi i64 [ %arg3, %bb ], [ %i13, %bb12 ] |
| br i1 undef, label %bb14, label %bb12 |
| |
| bb12: ; preds = %bb58, %bb9 |
| %i13 = add nuw nsw i64 %i10, %i8 |
| br label %bb9 |
| |
| bb14: ; preds = %bb9 |
| %i11 = icmp slt i64 %i10, 0 |
| %i15 = load i64, i64 addrspace(1)* null, align 8 |
| br label %bb16 |
| |
| bb16: ; preds = %bb58, %bb14 |
| %i17 = phi i64 [ %i65, %bb58 ], [ %i15, %bb14 ] |
| %i18 = phi <2 x float> [ %i59, %bb58 ], [ zeroinitializer, %bb14 ] |
| %i19 = phi <2 x float> [ %i60, %bb58 ], [ zeroinitializer, %bb14 ] |
| %i20 = phi <2 x float> [ %i61, %bb58 ], [ zeroinitializer, %bb14 ] |
| %i21 = phi <2 x float> [ %i62, %bb58 ], [ zeroinitializer, %bb14 ] |
| %i22 = add nsw i64 %i17, 1 |
| %i23 = mul nsw i64 %i22, %arg |
| %i24 = add nsw i64 %i23, %i10 |
| %i25 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 8 |
| %i26 = bitcast half addrspace(1)* %i25 to <2 x half> addrspace(1)* |
| %i27 = load volatile <2 x half>, <2 x half> addrspace(1)* %i26, align 16 |
| %i28 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 10 |
| %i29 = bitcast half addrspace(1)* %i28 to <2 x half> addrspace(1)* |
| %i30 = load volatile <2 x half>, <2 x half> addrspace(1)* %i29, align 4 |
| %i31 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 12 |
| %i32 = bitcast half addrspace(1)* %i31 to <2 x half> addrspace(1)* |
| %i33 = load volatile <2 x half>, <2 x half> addrspace(1)* %i32, align 8 |
| %i34 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 14 |
| %i35 = bitcast half addrspace(1)* %i34 to <2 x half> addrspace(1)* |
| %i36 = load volatile <2 x half>, <2 x half> addrspace(1)* %i35, align 4 |
| %i43 = load volatile <2 x float>, <2 x float> addrspace(3)* null, align 8 |
| %i46 = load volatile <2 x float>, <2 x float> addrspace(3)* undef, align 32 |
| fence syncscope("workgroup") acquire |
| br i1 %i11, label %bb58, label %bb51 |
| |
| bb51: ; preds = %bb16 |
| %i37 = fpext <2 x half> %arg4 to <2 x float> |
| %i39 = fpext <2 x half> %i27 to <2 x float> |
| %i40 = fpext <2 x half> %i30 to <2 x float> |
| %i41 = fpext <2 x half> %i33 to <2 x float> |
| %i42 = fpext <2 x half> %i36 to <2 x float> |
| %i44 = fadd contract <2 x float> %i37, %i43 |
| %i45 = fadd contract <2 x float> %i43, zeroinitializer |
| %i47 = fadd contract <2 x float> %i39, %i46 |
| %i48 = fadd contract <2 x float> %i40, %i43 |
| %i49 = fadd contract <2 x float> %i41, zeroinitializer |
| %i50 = fadd contract <2 x float> %i42, zeroinitializer |
| %i52 = fadd contract <2 x float> %i18, %i44 |
| %i53 = fadd contract <2 x float> %i19, %i45 |
| %i54 = fadd contract <2 x float> %i20, %i47 |
| %i55 = fadd contract <2 x float> %i21, %i48 |
| %i56 = fadd contract <2 x float> %i49, zeroinitializer |
| %i57 = fadd contract <2 x float> %i50, zeroinitializer |
| br label %bb58 |
| |
| bb58: ; preds = %bb51, %bb16 |
| %i59 = phi <2 x float> [ %i18, %bb16 ], [ %i52, %bb51 ] |
| %i60 = phi <2 x float> [ %i19, %bb16 ], [ %i53, %bb51 ] |
| %i61 = phi <2 x float> [ %i20, %bb16 ], [ %i54, %bb51 ] |
| %i62 = phi <2 x float> [ %i21, %bb16 ], [ %i55, %bb51 ] |
| %i63 = phi <2 x float> [ zeroinitializer, %bb16 ], [ %i56, %bb51 ] |
| %i64 = phi <2 x float> [ zeroinitializer, %bb16 ], [ %i57, %bb51 ] |
| %i65 = add nsw i64 %i17, %i6 |
| %i66 = icmp slt i64 %i65, 0 |
| br i1 %i66, label %bb16, label %bb12 |
| } |
| |
| ; This testcase would fail on GFX908 due to not having a free VGPR available to |
| ; copy SGPR to AGPR. |
| ; |
| ; The inline asm in the body defines v[0:31] and keeps them live until the final |
| ; "use", while the s[0:15] tuple it defines is fed to the MFMA as the accumulator |
| ; operand, so every SGPR lane has to reach an AGPR. The GFX908 checks route each |
| ; lane through a temporary VGPR (v34); the GFX90A checks write the AGPRs straight |
| ; from the SGPRs. |
| define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 { |
| ; GFX908-LABEL: no_free_vgprs_at_sgpr_to_agpr_copy: |
| ; GFX908: ; %bb.0: |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX908-NEXT: v_mov_b32_e32 v32, v1 |
| ; GFX908-NEXT: v_mov_b32_e32 v33, v0 |
| ; GFX908-NEXT: ;;#ASMSTART |
| ; GFX908-NEXT: ; def v[0:31] s[0:15] |
| ; GFX908-NEXT: ;;#ASMEND |
| ; GFX908-NEXT: v_mov_b32_e32 v34, s15 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a31, v34 |
| ; GFX908-NEXT: v_mov_b32_e32 v34, s14 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a30, v34 |
| ; GFX908-NEXT: v_mov_b32_e32 v34, s13 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a29, v34 |
| ; GFX908-NEXT: v_mov_b32_e32 v34, s12 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a28, v34 |
| ; GFX908-NEXT: v_mov_b32_e32 v34, s11 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a27, v34 |
| ; GFX908-NEXT: v_mov_b32_e32 v34, s10 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a26, v34 |
| ; GFX908-NEXT: v_mov_b32_e32 v34, s9 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a25, v34 |
| ; GFX908-NEXT: v_mov_b32_e32 v34, s8 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a24, v34 |
| ; GFX908-NEXT: v_mov_b32_e32 v34, s7 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a23, v34 |
| ; GFX908-NEXT: v_mov_b32_e32 v34, s6 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a22, v34 |
| ; GFX908-NEXT: v_mov_b32_e32 v34, s5 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a21, v34 |
| ; GFX908-NEXT: v_mov_b32_e32 v34, s4 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a20, v34 |
| ; GFX908-NEXT: v_mov_b32_e32 v34, s3 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a19, v34 |
| ; GFX908-NEXT: v_mov_b32_e32 v34, s2 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a18, v34 |
| ; GFX908-NEXT: v_mov_b32_e32 v34, s1 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a17, v34 |
| ; GFX908-NEXT: v_mov_b32_e32 v34, s0 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a16, v34 |
| ; GFX908-NEXT: s_nop 0 |
| ; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31] |
| ; GFX908-NEXT: s_nop 7 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a0 ; Reload Reuse |
| ; GFX908-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse |
| ; GFX908-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a1 ; Reload Reuse |
| ; GFX908-NEXT: v_accvgpr_read_b32 v36, a13 ; Reload Reuse |
| ; GFX908-NEXT: v_accvgpr_read_b32 v35, a14 ; Reload Reuse |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a2 ; Reload Reuse |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a3 ; Reload Reuse |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a4 ; Reload Reuse |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a5 ; Reload Reuse |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a6 ; Reload Reuse |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a7 ; Reload Reuse |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a8 ; Reload Reuse |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a9 ; Reload Reuse |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a10 ; Reload Reuse |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a15 ; Reload Reuse |
| ; GFX908-NEXT: ;;#ASMSTART |
| ; GFX908-NEXT: ; copy |
| ; GFX908-NEXT: ;;#ASMEND |
| ; GFX908-NEXT: v_accvgpr_read_b32 v34, a1 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a32, v34 |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a0, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a1, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a2, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a3, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a4, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a5, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a6, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a7, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a8, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a9, v34 ; Reload Reuse |
| ; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload |
| ; GFX908-NEXT: s_waitcnt vmcnt(0) |
| ; GFX908-NEXT: v_accvgpr_write_b32 a10, v34 ; Reload Reuse |
| ; GFX908-NEXT: v_accvgpr_write_b32 a11, v38 ; Reload Reuse |
| ; GFX908-NEXT: v_accvgpr_write_b32 a12, v37 ; Reload Reuse |
| ; GFX908-NEXT: v_accvgpr_write_b32 a13, v36 ; Reload Reuse |
| ; GFX908-NEXT: v_accvgpr_write_b32 a14, v35 ; Reload Reuse |
| ; GFX908-NEXT: v_accvgpr_write_b32 a15, v34 ; Reload Reuse |
| ; GFX908-NEXT: ;;#ASMSTART |
| ; GFX908-NEXT: ; copy |
| ; GFX908-NEXT: ;;#ASMEND |
| ; GFX908-NEXT: v_accvgpr_read_b32 v32, a2 |
| ; GFX908-NEXT: s_nop 1 |
| ; GFX908-NEXT: v_accvgpr_write_b32 a3, v32 |
| ; GFX908-NEXT: ;;#ASMSTART |
| ; GFX908-NEXT: ; use a3 v[0:31] |
| ; GFX908-NEXT: ;;#ASMEND |
| ; GFX908-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX90A-LABEL: no_free_vgprs_at_sgpr_to_agpr_copy: |
| ; GFX90A: ; %bb.0: |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX90A-NEXT: v_mov_b32_e32 v33, v0 |
| ; GFX90A-NEXT: v_mov_b32_e32 v32, v1 |
| ; GFX90A-NEXT: ;;#ASMSTART |
| ; GFX90A-NEXT: ; def v[0:31] s[0:15] |
| ; GFX90A-NEXT: ;;#ASMEND |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14 |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13 |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12 |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11 |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10 |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9 |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8 |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7 |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6 |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5 |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4 |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3 |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2 |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 |
| ; GFX90A-NEXT: v_accvgpr_read_b32 v34, a32 ; Reload Reuse |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31] |
| ; GFX90A-NEXT: s_nop 7 |
| ; GFX90A-NEXT: s_nop 2 |
| ; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) |
| ; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: buffer_store_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: buffer_store_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: buffer_store_dword a4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: buffer_store_dword a5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: buffer_store_dword a6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: buffer_store_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: buffer_store_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: buffer_store_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: buffer_store_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: v_accvgpr_read_b32 v39, a11 ; Reload Reuse |
| ; GFX90A-NEXT: v_accvgpr_read_b32 v38, a12 ; Reload Reuse |
| ; GFX90A-NEXT: v_accvgpr_read_b32 v37, a13 ; Reload Reuse |
| ; GFX90A-NEXT: v_accvgpr_read_b32 v36, a14 ; Reload Reuse |
| ; GFX90A-NEXT: v_accvgpr_read_b32 v35, a15 ; Reload Reuse |
| ; GFX90A-NEXT: ;;#ASMSTART |
| ; GFX90A-NEXT: ; copy |
| ; GFX90A-NEXT: ;;#ASMEND |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a32, a1 |
| ; GFX90A-NEXT: buffer_load_dword a0, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: buffer_load_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: buffer_load_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: buffer_load_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: buffer_load_dword a4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: buffer_load_dword a5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: buffer_load_dword a6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: buffer_load_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: buffer_load_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: buffer_load_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_nop 0 |
| ; GFX90A-NEXT: buffer_load_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a11, v39 ; Reload Reuse |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a12, v38 ; Reload Reuse |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a13, v37 ; Reload Reuse |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a14, v36 ; Reload Reuse |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a15, v35 ; Reload Reuse |
| ; GFX90A-NEXT: ;;#ASMSTART |
| ; GFX90A-NEXT: ; copy |
| ; GFX90A-NEXT: ;;#ASMEND |
| ; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a2 |
| ; GFX90A-NEXT: ;;#ASMSTART |
| ; GFX90A-NEXT: ; use a3 v[0:31] |
| ; GFX90A-NEXT: ;;#ASMEND |
| ; GFX90A-NEXT: v_accvgpr_write_b32 a32, v34 ; Reload Reuse |
| ; GFX90A-NEXT: s_setpc_b64 s[30:31] |
| ; One asm defines both physical tuples: v[0:31] (consumed only by the final "use") and s[0:15]. |
| %asm = call { <32 x i32>, <16 x float> } asm sideeffect "; def $0 $1","=${v[0:31]},=${s[0:15]}"() |
| %vgpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 0 |
| ; NOTE(review): despite its name, %agpr0 is the value the asm defined in s[0:15]; it only reaches AGPRs as the MFMA accumulator below. |
| %agpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 1 |
| ; The accumulator operand comes from the SGPR tuple; this is the SGPR-to-AGPR copy the test is about. |
| %mfma = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %v0, float %v1, <16 x float> %agpr0, i32 0, i32 0, i32 0) |
| ; Produces a1 from an "a"-constrained input and clobbers a[0:15]. %mfma is needed again by the next asm, so it is saved and restored around this one (the Folded Spill/Reload and Reload Reuse lines in the checks). |
| %agpr1 = call i32 asm sideeffect "; copy ", "={a1},a,~{a[0:15]}"(<16 x float> %agpr0) |
| ; Produces a2; takes %agpr1 in an AGPR and %mfma pinned to a[0:15]. |
| %agpr2 = call i32 asm sideeffect "; copy ", "={a2},a,{a[0:15]}"(i32 %agpr1, <16 x float> %mfma) |
| ; Final use pins %agpr2 to a3 and keeps all 32 VGPRs from the first asm live up to this point. |
| call void asm sideeffect "; use $0 $1","{a3},{v[0:31]}"(i32 %agpr2, <32 x i32> %vgpr0) |
| ret void |
| } |
| |
| ; Intrinsic declarations. The MFMA intrinsic is called by no_free_vgprs_at_sgpr_to_agpr_copy to consume a <16 x float> accumulator. |
| declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #1 |
| declare i32 @llvm.amdgcn.workitem.id.x() #2 |
| |
| ; #0 is the attribute group on the no_free_vgprs_* test functions. |
| ; NOTE(review): "amdgpu-waves-per-eu"="6,6" presumably fixes the occupancy target and with it the per-wave register budget -- confirm against the AMDGPU usage documentation. |
| attributes #0 = { "amdgpu-waves-per-eu"="6,6" } |
| ; #1 and #2 are the attribute groups referenced by the two intrinsic declarations above. |
| attributes #1 = { convergent nounwind readnone willreturn } |
| attributes #2 = { nounwind readnone willreturn } |
| ; #3 has no user in this part of the file; presumably it is referenced by a function defined earlier -- verify before removing. |
| attributes #3 = { "amdgpu-waves-per-eu"="7,7" } |