| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -o - %s | FileCheck -check-prefix=CHECK %s |
| |
; Test: amdgpu_cs entry point limited to 6 VGPRs ("amdgpu-num-vgpr"="6").
; Five values are read with volatile loads, an empty inline asm then
; clobbers v0-v4, and the five values are written back with volatile stores.
; The autogenerated checks expect one value to stay in v5 and the other four
; to be spilled to scratch (offsets 0, 4, 8, 12) and reloaded one at a time
; with th:TH_LOAD_LU.
; NOTE(review): presumably the point of the test is the TH_LOAD_LU hint on
; spill reloads -- confirm against the change that introduced it.
| define amdgpu_cs void @max_6_vgprs(ptr addrspace(1) %p) "amdgpu-num-vgpr"="6" { |
| ; CHECK-LABEL: max_6_vgprs: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: global_load_b32 v2, v[0:1], off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v2 |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; CHECK-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] |
| ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo |
| ; CHECK-NEXT: global_load_b32 v5, v[0:1], off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: scratch_store_b32 off, v2, off ; 4-byte Folded Spill |
| ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: scratch_store_b32 off, v2, off offset:4 ; 4-byte Folded Spill |
| ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: scratch_store_b32 off, v2, off offset:8 ; 4-byte Folded Spill |
| ; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:160 scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: scratch_store_b32 off, v0, off offset:12 ; 4-byte Folded Spill |
| ; CHECK-NEXT: ;;#ASMSTART |
| ; CHECK-NEXT: ;;#ASMEND |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v5, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: s_endpgm |
; The index is read with a volatile load; %p2..%p5 are chained GEPs, each
; stepping a further 4, 8, 12 and 16 i32 elements from the previous pointer
; (byte offsets 16, 48, 96 and 160 from %p1 in the checks above).
| %tid = load volatile i32, ptr addrspace(1) poison |
| %p1 = getelementptr inbounds i32, ptr addrspace(1) %p, i32 %tid |
| %p2 = getelementptr inbounds i32, ptr addrspace(1) %p1, i32 4 |
| %p3 = getelementptr inbounds i32, ptr addrspace(1) %p2, i32 8 |
| %p4 = getelementptr inbounds i32, ptr addrspace(1) %p3, i32 12 |
| %p5 = getelementptr inbounds i32, ptr addrspace(1) %p4, i32 16 |
| %v1 = load volatile i32, ptr addrspace(1) %p1 |
| %v2 = load volatile i32, ptr addrspace(1) %p2 |
| %v3 = load volatile i32, ptr addrspace(1) %p3 |
| %v4 = load volatile i32, ptr addrspace(1) %p4 |
| %v5 = load volatile i32, ptr addrspace(1) %p5 |
; Empty asm that clobbers v0-v4 while %v1..%v5 are still live (they are
; stored below), which is what forces the spills seen in the checks.
| call void asm sideeffect "", "~{v[0:4]}" () |
| store volatile i32 %v1, ptr addrspace(1) poison |
| store volatile i32 %v2, ptr addrspace(1) poison |
| store volatile i32 %v3, ptr addrspace(1) poison |
| store volatile i32 %v4, ptr addrspace(1) poison |
| store volatile i32 %v5, ptr addrspace(1) poison |
| ret void |
| } |
| |
; Test: amdgpu_cs entry point limited to 11 VGPRs ("amdgpu-num-vgpr"="11")
; with a two-way branch on %tmp.
;   - %.entry loads %v7..%v10; %v7/%v8 are stored in both arms, %v9/%v10 are
;     stored only in %.exit.
;   - Each arm loads six more values, runs an empty inline asm that clobbers
;     v0-v9, then stores its six values followed by %v7 and %v8.
; The autogenerated checks expect the four entry values to be spilled to
; scratch offsets 8, 12, 0 and 4, and in each arm one value kept in v10 with
; the other five spilled to offsets 16-32. Every spill reload in this
; function carries th:TH_LOAD_LU.
; Note that the emitted code places %.false first (%bb.1) and %.true second
; (%bb.3).
| define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgpu-num-vgpr"="11" { |
| ; CHECK-LABEL: max_11_vgprs_branch: |
| ; CHECK: ; %bb.0: ; %.entry |
| ; CHECK-NEXT: global_load_b32 v3, v[0:1], off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: s_mov_b32 s0, exec_lo |
| ; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3 |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; CHECK-NEXT: v_lshlrev_b64_e32 v[3:4], 2, v[3:4] |
| ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 |
| ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo |
| ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:336 scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: scratch_store_b32 off, v3, off offset:8 ; 4-byte Folded Spill |
| ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:448 scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: scratch_store_b32 off, v3, off offset:12 ; 4-byte Folded Spill |
| ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:576 scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: scratch_store_b32 off, v3, off ; 4-byte Folded Spill |
| ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:720 scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: scratch_store_b32 off, v3, off offset:4 ; 4-byte Folded Spill |
| ; CHECK-NEXT: v_cmpx_eq_u32_e32 0, v2 |
| ; CHECK-NEXT: s_xor_b32 s0, exec_lo, s0 |
| ; CHECK-NEXT: s_cbranch_execz .LBB1_2 |
| ; CHECK-NEXT: ; %bb.1: ; %.false |
| ; CHECK-NEXT: global_load_b32 v10, v[0:1], off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: scratch_store_b32 off, v2, off offset:16 ; 4-byte Folded Spill |
| ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill |
| ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill |
| ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:160 scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill |
| ; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:240 scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill |
| ; CHECK-NEXT: ;;#ASMSTART |
| ; CHECK-NEXT: ;;#ASMEND |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v10, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:24 th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:28 th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:32 th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: ; implicit-def: $vgpr0 |
| ; CHECK-NEXT: ; kill: killed $vgpr0 |
| ; CHECK-NEXT: ; implicit-def: $vgpr0 |
| ; CHECK-NEXT: ; kill: killed $vgpr0 |
| ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 |
| ; CHECK-NEXT: .LBB1_2: ; %Flow |
| ; CHECK-NEXT: s_and_not1_saveexec_b32 s0, s0 |
| ; CHECK-NEXT: s_cbranch_execz .LBB1_4 |
| ; CHECK-NEXT: ; %bb.3: ; %.true |
| ; CHECK-NEXT: global_load_b32 v10, v[0:1], off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: scratch_store_b32 off, v2, off offset:16 ; 4-byte Folded Spill |
| ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill |
| ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill |
| ; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:160 scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill |
| ; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:240 scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill |
| ; CHECK-NEXT: ;;#ASMSTART |
| ; CHECK-NEXT: ;;#ASMEND |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v10, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:24 th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:28 th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:32 th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: .LBB1_4: ; %.exit |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS |
| ; CHECK-NEXT: s_wait_storecnt 0x0 |
| ; CHECK-NEXT: s_endpgm |
| .entry: |
; %p1..%p10 are chained GEPs with steps of 4, 8, ..., 36 i32 elements.
; Only %v7..%v10 are loaded here; %p1..%p6 are loaded inside the arms.
| %tid = load volatile i32, ptr addrspace(1) poison |
| %p1 = getelementptr inbounds i32, ptr addrspace(1) %p, i32 %tid |
| %p2 = getelementptr inbounds i32, ptr addrspace(1) %p1, i32 4 |
| %p3 = getelementptr inbounds i32, ptr addrspace(1) %p2, i32 8 |
| %p4 = getelementptr inbounds i32, ptr addrspace(1) %p3, i32 12 |
| %p5 = getelementptr inbounds i32, ptr addrspace(1) %p4, i32 16 |
| %p6 = getelementptr inbounds i32, ptr addrspace(1) %p5, i32 20 |
| %p7 = getelementptr inbounds i32, ptr addrspace(1) %p6, i32 24 |
| %p8 = getelementptr inbounds i32, ptr addrspace(1) %p7, i32 28 |
| %p9 = getelementptr inbounds i32, ptr addrspace(1) %p8, i32 32 |
| %p10 = getelementptr inbounds i32, ptr addrspace(1) %p9, i32 36 |
| %v7 = load volatile i32, ptr addrspace(1) %p7 |
| %v8 = load volatile i32, ptr addrspace(1) %p8 |
| %v9 = load volatile i32, ptr addrspace(1) %p9 |
| %v10 = load volatile i32, ptr addrspace(1) %p10 |
| %cmp = icmp ne i32 %tmp, 0 |
| br i1 %cmp, label %.true, label %.false |
| |
| .true: |
| %v1_t = load volatile i32, ptr addrspace(1) %p1 |
| %v2_t = load volatile i32, ptr addrspace(1) %p2 |
| %v3_t = load volatile i32, ptr addrspace(1) %p3 |
| %v4_t = load volatile i32, ptr addrspace(1) %p4 |
| %v5_t = load volatile i32, ptr addrspace(1) %p5 |
| %v6_t = load volatile i32, ptr addrspace(1) %p6 |
; Clobbers v0-v9 while the six values above and %v7..%v10 are still live.
| call void asm sideeffect "", "~{v[0:9]}" () |
| store volatile i32 %v1_t, ptr addrspace(1) poison |
| store volatile i32 %v2_t, ptr addrspace(1) poison |
| store volatile i32 %v3_t, ptr addrspace(1) poison |
| store volatile i32 %v4_t, ptr addrspace(1) poison |
| store volatile i32 %v5_t, ptr addrspace(1) poison |
| store volatile i32 %v6_t, ptr addrspace(1) poison |
| store volatile i32 %v7, ptr addrspace(1) poison |
| store volatile i32 %v8, ptr addrspace(1) poison |
| |
| br label %.exit |
| |
; Same instruction sequence as %.true, using the _f value names.
| .false: |
| %v1_f = load volatile i32, ptr addrspace(1) %p1 |
| %v2_f = load volatile i32, ptr addrspace(1) %p2 |
| %v3_f = load volatile i32, ptr addrspace(1) %p3 |
| %v4_f = load volatile i32, ptr addrspace(1) %p4 |
| %v5_f = load volatile i32, ptr addrspace(1) %p5 |
| %v6_f = load volatile i32, ptr addrspace(1) %p6 |
| call void asm sideeffect "", "~{v[0:9]}" () |
| store volatile i32 %v1_f, ptr addrspace(1) poison |
| store volatile i32 %v2_f, ptr addrspace(1) poison |
| store volatile i32 %v3_f, ptr addrspace(1) poison |
| store volatile i32 %v4_f, ptr addrspace(1) poison |
| store volatile i32 %v5_f, ptr addrspace(1) poison |
| store volatile i32 %v6_f, ptr addrspace(1) poison |
| store volatile i32 %v7, ptr addrspace(1) poison |
| store volatile i32 %v8, ptr addrspace(1) poison |
| |
| br label %.exit |
| |
; %v9 and %v10 are used only here, after the arms rejoin.
| .exit: |
| store volatile i32 %v9, ptr addrspace(1) poison |
| store volatile i32 %v10, ptr addrspace(1) poison |
| ret void |
| } |
| |
| |
; External callees used by @baz below: @foo is called first and its result is
; unused; @bar then receives the <32 x i64> value that was loaded before the
; call to @foo.
| declare i32 @foo() nounwind |
| declare <8 x half> @bar(<32 x i64>) nounwind |
| |
; Test: non-entry function that loads a <32 x i64> value, calls @foo, and
; then passes the loaded value to @bar, so the value is live across the first
; call.
; In the autogenerated checks:
;   - parts of the loaded value are written to scratch at s33 offsets 148-388
;     before the call to @foo (the "Folded Spill" b128 stores);
;   - they are read back before the call to @bar with th:TH_LOAD_LU
;     (the "Folded Reload" b128 loads);
;   - the scratch loads after the call to @bar that restore v40-v111 and v93
;     carry no th: modifier.
; NOTE(review): presumably the test guards that TH_LOAD_LU is applied only to
; the reloads of the spilled value and not to the register restores at the
; end of the function -- confirm against the change that introduced it.
| define <8 x half> @baz() nounwind { |
| ; CHECK-LABEL: baz: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; CHECK-NEXT: s_wait_expcnt 0x0 |
| ; CHECK-NEXT: s_wait_samplecnt 0x0 |
| ; CHECK-NEXT: s_wait_bvhcnt 0x0 |
| ; CHECK-NEXT: s_wait_kmcnt 0x0 |
| ; CHECK-NEXT: s_mov_b32 s0, s33 |
| ; CHECK-NEXT: s_mov_b32 s33, s32 |
| ; CHECK-NEXT: s_or_saveexec_b32 s1, -1 |
| ; CHECK-NEXT: scratch_store_b32 off, v93, s33 offset:404 ; 4-byte Folded Spill |
| ; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; CHECK-NEXT: s_mov_b32 exec_lo, s1 |
| ; CHECK-NEXT: s_clause 0x1f ; 128-byte Folded Spill |
| ; CHECK-NEXT: scratch_store_b32 off, v40, s33 offset:144 |
| ; CHECK-NEXT: scratch_store_b32 off, v41, s33 offset:140 |
| ; CHECK-NEXT: scratch_store_b32 off, v42, s33 offset:136 |
| ; CHECK-NEXT: scratch_store_b32 off, v43, s33 offset:132 |
| ; CHECK-NEXT: scratch_store_b32 off, v44, s33 offset:128 |
| ; CHECK-NEXT: scratch_store_b32 off, v45, s33 offset:124 |
| ; CHECK-NEXT: scratch_store_b32 off, v46, s33 offset:120 |
| ; CHECK-NEXT: scratch_store_b32 off, v47, s33 offset:116 |
| ; CHECK-NEXT: scratch_store_b32 off, v56, s33 offset:112 |
| ; CHECK-NEXT: scratch_store_b32 off, v57, s33 offset:108 |
| ; CHECK-NEXT: scratch_store_b32 off, v58, s33 offset:104 |
| ; CHECK-NEXT: scratch_store_b32 off, v59, s33 offset:100 |
| ; CHECK-NEXT: scratch_store_b32 off, v60, s33 offset:96 |
| ; CHECK-NEXT: scratch_store_b32 off, v61, s33 offset:92 |
| ; CHECK-NEXT: scratch_store_b32 off, v62, s33 offset:88 |
| ; CHECK-NEXT: scratch_store_b32 off, v63, s33 offset:84 |
| ; CHECK-NEXT: scratch_store_b32 off, v72, s33 offset:80 |
| ; CHECK-NEXT: scratch_store_b32 off, v73, s33 offset:76 |
| ; CHECK-NEXT: scratch_store_b32 off, v74, s33 offset:72 |
| ; CHECK-NEXT: scratch_store_b32 off, v75, s33 offset:68 |
| ; CHECK-NEXT: scratch_store_b32 off, v76, s33 offset:64 |
| ; CHECK-NEXT: scratch_store_b32 off, v77, s33 offset:60 |
| ; CHECK-NEXT: scratch_store_b32 off, v78, s33 offset:56 |
| ; CHECK-NEXT: scratch_store_b32 off, v79, s33 offset:52 |
| ; CHECK-NEXT: scratch_store_b32 off, v88, s33 offset:48 |
| ; CHECK-NEXT: scratch_store_b32 off, v89, s33 offset:44 |
| ; CHECK-NEXT: scratch_store_b32 off, v90, s33 offset:40 |
| ; CHECK-NEXT: scratch_store_b32 off, v91, s33 offset:36 |
| ; CHECK-NEXT: scratch_store_b32 off, v92, s33 offset:32 |
| ; CHECK-NEXT: scratch_store_b32 off, v104, s33 offset:28 |
| ; CHECK-NEXT: scratch_store_b32 off, v105, s33 offset:24 |
| ; CHECK-NEXT: scratch_store_b32 off, v106, s33 offset:20 |
| ; CHECK-NEXT: s_clause 0x4 ; 20-byte Folded Spill |
| ; CHECK-NEXT: scratch_store_b32 off, v107, s33 offset:16 |
| ; CHECK-NEXT: scratch_store_b32 off, v108, s33 offset:12 |
| ; CHECK-NEXT: scratch_store_b32 off, v109, s33 offset:8 |
| ; CHECK-NEXT: scratch_store_b32 off, v110, s33 offset:4 |
| ; CHECK-NEXT: scratch_store_b32 off, v111, s33 |
| ; CHECK-NEXT: v_dual_mov_b32 v92, v31 :: v_dual_mov_b32 v1, 0 |
| ; CHECK-NEXT: v_dual_mov_b32 v0, 0x60 :: v_dual_mov_b32 v3, 0 |
| ; CHECK-NEXT: v_dual_mov_b32 v2, 0x50 :: v_dual_mov_b32 v5, 0 |
| ; CHECK-NEXT: v_dual_mov_b32 v4, 64 :: v_dual_mov_b32 v7, 0 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, 48 |
| ; CHECK-NEXT: s_clause 0x1 |
| ; CHECK-NEXT: global_load_b128 v[56:59], v[0:1], off |
| ; CHECK-NEXT: global_load_b128 v[104:107], v[2:3], off |
| ; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, 0 |
| ; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x70 |
| ; CHECK-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, 0x80 |
| ; CHECK-NEXT: s_clause 0x1 |
| ; CHECK-NEXT: global_load_b128 v[108:111], v[4:5], off |
| ; CHECK-NEXT: global_load_b128 v[60:63], v[6:7], off |
| ; CHECK-NEXT: v_mov_b32_e32 v5, 0 |
| ; CHECK-NEXT: s_clause 0x4 |
| ; CHECK-NEXT: global_load_b128 v[72:75], v[0:1], off |
| ; CHECK-NEXT: global_load_b128 v[10:13], v[2:3], off |
| ; CHECK-NEXT: global_load_b128 v[14:17], v[8:9], off |
| ; CHECK-NEXT: global_load_b128 v[18:21], v[8:9], off offset:16 |
| ; CHECK-NEXT: global_load_b128 v[22:25], v[8:9], off offset:32 |
| ; CHECK-NEXT: v_dual_mov_b32 v4, 32 :: v_dual_mov_b32 v7, 0 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, 16 |
| ; CHECK-NEXT: s_clause 0x1 |
| ; CHECK-NEXT: global_load_b128 v[76:79], v[4:5], off |
| ; CHECK-NEXT: global_load_b128 v[88:91], v[6:7], off |
| ; CHECK-NEXT: v_writelane_b32 v93, s0, 14 |
| ; CHECK-NEXT: s_addk_co_i32 s32, 0x1a0 |
| ; CHECK-NEXT: s_getpc_b64 s[0:1] |
| ; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; CHECK-NEXT: s_sext_i32_i16 s1, s1 |
| ; CHECK-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12 |
| ; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; CHECK-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24 |
| ; CHECK-NEXT: s_wait_loadcnt 0x5 |
| ; CHECK-NEXT: scratch_store_b128 off, v[10:13], s33 offset:148 ; 16-byte Folded Spill |
| ; CHECK-NEXT: s_wait_loadcnt 0x4 |
| ; CHECK-NEXT: scratch_store_b128 off, v[14:17], s33 offset:164 ; 16-byte Folded Spill |
| ; CHECK-NEXT: s_wait_loadcnt 0x3 |
| ; CHECK-NEXT: scratch_store_b128 off, v[18:21], s33 offset:180 ; 16-byte Folded Spill |
| ; CHECK-NEXT: s_wait_loadcnt 0x2 |
| ; CHECK-NEXT: s_clause 0x4 ; 80-byte Folded Spill |
| ; CHECK-NEXT: scratch_store_b128 off, v[22:25], s33 offset:196 |
| ; CHECK-NEXT: scratch_store_b128 off, v[26:29], s33 offset:212 |
| ; CHECK-NEXT: scratch_store_b128 off, v[30:33], s33 offset:228 |
| ; CHECK-NEXT: scratch_store_b128 off, v[34:37], s33 offset:244 |
| ; CHECK-NEXT: scratch_store_b128 off, v[38:41], s33 offset:260 |
| ; CHECK-NEXT: s_clause 0x4 |
| ; CHECK-NEXT: global_load_b128 v[10:13], v[8:9], off offset:48 |
| ; CHECK-NEXT: global_load_b128 v[14:17], v[8:9], off offset:64 |
| ; CHECK-NEXT: global_load_b128 v[18:21], v[8:9], off offset:80 |
| ; CHECK-NEXT: global_load_b128 v[22:25], v[8:9], off offset:96 |
| ; CHECK-NEXT: global_load_b128 v[26:29], v[8:9], off offset:112 |
| ; CHECK-NEXT: v_writelane_b32 v93, s30, 0 |
| ; CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; CHECK-NEXT: s_wait_loadcnt 0x4 |
| ; CHECK-NEXT: scratch_store_b128 off, v[10:13], s33 offset:276 ; 16-byte Folded Spill |
| ; CHECK-NEXT: s_wait_loadcnt 0x3 |
| ; CHECK-NEXT: scratch_store_b128 off, v[14:17], s33 offset:292 ; 16-byte Folded Spill |
| ; CHECK-NEXT: s_wait_loadcnt 0x2 |
| ; CHECK-NEXT: scratch_store_b128 off, v[18:21], s33 offset:308 ; 16-byte Folded Spill |
| ; CHECK-NEXT: s_wait_loadcnt 0x1 |
| ; CHECK-NEXT: scratch_store_b128 off, v[22:25], s33 offset:324 ; 16-byte Folded Spill |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: s_clause 0x3 ; 64-byte Folded Spill |
| ; CHECK-NEXT: scratch_store_b128 off, v[26:29], s33 offset:340 |
| ; CHECK-NEXT: scratch_store_b128 off, v[30:33], s33 offset:356 |
| ; CHECK-NEXT: scratch_store_b128 off, v[34:37], s33 offset:372 |
| ; CHECK-NEXT: scratch_store_b128 off, v[38:41], s33 offset:388 |
| ; CHECK-NEXT: v_writelane_b32 v93, s31, 1 |
| ; CHECK-NEXT: v_writelane_b32 v93, s34, 2 |
| ; CHECK-NEXT: v_writelane_b32 v93, s35, 3 |
| ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] |
| ; CHECK-NEXT: v_writelane_b32 v93, s36, 4 |
| ; CHECK-NEXT: v_writelane_b32 v93, s37, 5 |
| ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] |
| ; CHECK-NEXT: v_writelane_b32 v93, s38, 6 |
| ; CHECK-NEXT: v_writelane_b32 v93, s39, 7 |
| ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] |
| ; CHECK-NEXT: v_writelane_b32 v93, s48, 8 |
| ; CHECK-NEXT: v_writelane_b32 v93, s49, 9 |
| ; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] |
| ; CHECK-NEXT: v_writelane_b32 v93, s50, 10 |
| ; CHECK-NEXT: s_mov_b32 s50, s15 |
| ; CHECK-NEXT: v_writelane_b32 v93, s51, 11 |
| ; CHECK-NEXT: s_mov_b32 s51, s14 |
| ; CHECK-NEXT: v_writelane_b32 v93, s52, 12 |
| ; CHECK-NEXT: s_mov_b32 s52, s13 |
| ; CHECK-NEXT: v_writelane_b32 v93, s53, 13 |
| ; CHECK-NEXT: s_mov_b32 s53, s12 |
| ; CHECK-NEXT: s_wait_kmcnt 0x0 |
| ; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; CHECK-NEXT: s_clause 0x7 ; 128-byte Folded Reload |
| ; CHECK-NEXT: scratch_load_b128 v[0:3], off, s33 offset:276 th:TH_LOAD_LU |
| ; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:292 th:TH_LOAD_LU |
| ; CHECK-NEXT: scratch_load_b128 v[8:11], off, s33 offset:308 th:TH_LOAD_LU |
| ; CHECK-NEXT: scratch_load_b128 v[12:15], off, s33 offset:324 th:TH_LOAD_LU |
| ; CHECK-NEXT: scratch_load_b128 v[16:19], off, s33 offset:340 th:TH_LOAD_LU |
| ; CHECK-NEXT: scratch_load_b128 v[20:23], off, s33 offset:356 th:TH_LOAD_LU |
| ; CHECK-NEXT: scratch_load_b128 v[24:27], off, s33 offset:372 th:TH_LOAD_LU |
| ; CHECK-NEXT: scratch_load_b128 v[28:31], off, s33 offset:388 th:TH_LOAD_LU |
| ; CHECK-NEXT: s_getpc_b64 s[0:1] |
| ; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; CHECK-NEXT: s_sext_i32_i16 s1, s1 |
| ; CHECK-NEXT: s_add_co_u32 s0, s0, bar@gotpcrel32@lo+12 |
| ; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; CHECK-NEXT: s_add_co_ci_u32 s1, s1, bar@gotpcrel32@hi+24 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] |
| ; CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] |
| ; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] |
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] |
| ; CHECK-NEXT: s_mov_b32 s12, s53 |
| ; CHECK-NEXT: s_mov_b32 s13, s52 |
| ; CHECK-NEXT: s_mov_b32 s14, s51 |
| ; CHECK-NEXT: s_mov_b32 s15, s50 |
| ; CHECK-NEXT: s_wait_loadcnt 0x3 |
| ; CHECK-NEXT: scratch_store_b32 off, v19, s32 offset:128 |
| ; CHECK-NEXT: s_wait_loadcnt 0x2 |
| ; CHECK-NEXT: scratch_load_b128 v[19:22], off, s33 offset:148 th:TH_LOAD_LU ; 16-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x2 |
| ; CHECK-NEXT: scratch_load_b128 v[23:26], off, s33 offset:164 th:TH_LOAD_LU ; 16-byte Folded Reload |
| ; CHECK-NEXT: s_wait_loadcnt 0x2 |
| ; CHECK-NEXT: s_clause 0x5 ; 96-byte Folded Reload |
| ; CHECK-NEXT: scratch_load_b128 v[27:30], off, s33 offset:180 th:TH_LOAD_LU |
| ; CHECK-NEXT: scratch_load_b128 v[31:34], off, s33 offset:196 th:TH_LOAD_LU |
| ; CHECK-NEXT: scratch_load_b128 v[35:38], off, s33 offset:212 th:TH_LOAD_LU |
| ; CHECK-NEXT: scratch_load_b128 v[39:42], off, s33 offset:228 th:TH_LOAD_LU |
| ; CHECK-NEXT: scratch_load_b128 v[43:46], off, s33 offset:244 th:TH_LOAD_LU |
| ; CHECK-NEXT: scratch_load_b128 v[47:50], off, s33 offset:260 th:TH_LOAD_LU |
| ; CHECK-NEXT: s_clause 0x3 |
| ; CHECK-NEXT: scratch_store_b128 off, v[15:18], s32 offset:112 |
| ; CHECK-NEXT: scratch_store_b128 off, v[11:14], s32 offset:96 |
| ; CHECK-NEXT: scratch_store_b128 off, v[7:10], s32 offset:80 |
| ; CHECK-NEXT: scratch_store_b128 off, v[3:6], s32 offset:64 |
| ; CHECK-NEXT: v_dual_mov_b32 v4, v88 :: v_dual_mov_b32 v5, v89 |
| ; CHECK-NEXT: v_dual_mov_b32 v6, v90 :: v_dual_mov_b32 v7, v91 |
| ; CHECK-NEXT: v_dual_mov_b32 v8, v76 :: v_dual_mov_b32 v9, v77 |
| ; CHECK-NEXT: v_dual_mov_b32 v10, v78 :: v_dual_mov_b32 v11, v79 |
| ; CHECK-NEXT: v_dual_mov_b32 v12, v60 :: v_dual_mov_b32 v13, v61 |
| ; CHECK-NEXT: v_dual_mov_b32 v14, v62 :: v_dual_mov_b32 v15, v63 |
| ; CHECK-NEXT: v_dual_mov_b32 v16, v108 :: v_dual_mov_b32 v17, v109 |
| ; CHECK-NEXT: v_mov_b32_e32 v18, v110 |
| ; CHECK-NEXT: s_wait_loadcnt 0x1 |
| ; CHECK-NEXT: v_dual_mov_b32 v44, v0 :: v_dual_mov_b32 v45, v1 |
| ; CHECK-NEXT: v_mov_b32_e32 v46, v2 |
| ; CHECK-NEXT: v_dual_mov_b32 v0, v72 :: v_dual_mov_b32 v1, v73 |
| ; CHECK-NEXT: v_dual_mov_b32 v2, v74 :: v_dual_mov_b32 v3, v75 |
| ; CHECK-NEXT: v_mov_b32_e32 v43, v34 |
| ; CHECK-NEXT: v_dual_mov_b32 v42, v33 :: v_dual_mov_b32 v41, v32 |
| ; CHECK-NEXT: v_dual_mov_b32 v40, v31 :: v_dual_mov_b32 v39, v30 |
| ; CHECK-NEXT: v_dual_mov_b32 v38, v29 :: v_dual_mov_b32 v37, v28 |
| ; CHECK-NEXT: v_dual_mov_b32 v36, v27 :: v_dual_mov_b32 v35, v26 |
| ; CHECK-NEXT: v_mov_b32_e32 v34, v25 |
| ; CHECK-NEXT: v_mov_b32_e32 v33, v24 |
| ; CHECK-NEXT: v_mov_b32_e32 v32, v23 |
| ; CHECK-NEXT: v_mov_b32_e32 v31, v22 |
| ; CHECK-NEXT: v_mov_b32_e32 v30, v21 |
| ; CHECK-NEXT: v_mov_b32_e32 v29, v20 |
| ; CHECK-NEXT: v_mov_b32_e32 v28, v19 |
| ; CHECK-NEXT: s_clause 0x3 |
| ; CHECK-NEXT: scratch_store_b128 off, v[43:46], s32 offset:48 |
| ; CHECK-NEXT: scratch_store_b128 off, v[39:42], s32 offset:32 |
| ; CHECK-NEXT: scratch_store_b128 off, v[35:38], s32 offset:16 |
| ; CHECK-NEXT: scratch_store_b128 off, v[31:34], s32 |
| ; CHECK-NEXT: v_mov_b32_e32 v31, v92 |
| ; CHECK-NEXT: v_dual_mov_b32 v19, v111 :: v_dual_mov_b32 v20, v104 |
| ; CHECK-NEXT: v_dual_mov_b32 v21, v105 :: v_dual_mov_b32 v22, v106 |
| ; CHECK-NEXT: v_dual_mov_b32 v23, v107 :: v_dual_mov_b32 v24, v56 |
| ; CHECK-NEXT: v_dual_mov_b32 v25, v57 :: v_dual_mov_b32 v26, v58 |
| ; CHECK-NEXT: v_mov_b32_e32 v27, v59 |
| ; CHECK-NEXT: s_wait_kmcnt 0x0 |
| ; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; CHECK-NEXT: s_clause 0x1f ; 128-byte Folded Reload |
| ; CHECK-NEXT: scratch_load_b32 v111, off, s33 |
| ; CHECK-NEXT: scratch_load_b32 v110, off, s33 offset:4 |
| ; CHECK-NEXT: scratch_load_b32 v109, off, s33 offset:8 |
| ; CHECK-NEXT: scratch_load_b32 v108, off, s33 offset:12 |
| ; CHECK-NEXT: scratch_load_b32 v107, off, s33 offset:16 |
| ; CHECK-NEXT: scratch_load_b32 v106, off, s33 offset:20 |
| ; CHECK-NEXT: scratch_load_b32 v105, off, s33 offset:24 |
| ; CHECK-NEXT: scratch_load_b32 v104, off, s33 offset:28 |
| ; CHECK-NEXT: scratch_load_b32 v92, off, s33 offset:32 |
| ; CHECK-NEXT: scratch_load_b32 v91, off, s33 offset:36 |
| ; CHECK-NEXT: scratch_load_b32 v90, off, s33 offset:40 |
| ; CHECK-NEXT: scratch_load_b32 v89, off, s33 offset:44 |
| ; CHECK-NEXT: scratch_load_b32 v88, off, s33 offset:48 |
| ; CHECK-NEXT: scratch_load_b32 v79, off, s33 offset:52 |
| ; CHECK-NEXT: scratch_load_b32 v78, off, s33 offset:56 |
| ; CHECK-NEXT: scratch_load_b32 v77, off, s33 offset:60 |
| ; CHECK-NEXT: scratch_load_b32 v76, off, s33 offset:64 |
| ; CHECK-NEXT: scratch_load_b32 v75, off, s33 offset:68 |
| ; CHECK-NEXT: scratch_load_b32 v74, off, s33 offset:72 |
| ; CHECK-NEXT: scratch_load_b32 v73, off, s33 offset:76 |
| ; CHECK-NEXT: scratch_load_b32 v72, off, s33 offset:80 |
| ; CHECK-NEXT: scratch_load_b32 v63, off, s33 offset:84 |
| ; CHECK-NEXT: scratch_load_b32 v62, off, s33 offset:88 |
| ; CHECK-NEXT: scratch_load_b32 v61, off, s33 offset:92 |
| ; CHECK-NEXT: scratch_load_b32 v60, off, s33 offset:96 |
| ; CHECK-NEXT: scratch_load_b32 v59, off, s33 offset:100 |
| ; CHECK-NEXT: scratch_load_b32 v58, off, s33 offset:104 |
| ; CHECK-NEXT: scratch_load_b32 v57, off, s33 offset:108 |
| ; CHECK-NEXT: scratch_load_b32 v56, off, s33 offset:112 |
| ; CHECK-NEXT: scratch_load_b32 v47, off, s33 offset:116 |
| ; CHECK-NEXT: scratch_load_b32 v46, off, s33 offset:120 |
| ; CHECK-NEXT: scratch_load_b32 v45, off, s33 offset:124 |
| ; CHECK-NEXT: s_clause 0x4 ; 20-byte Folded Reload |
| ; CHECK-NEXT: scratch_load_b32 v44, off, s33 offset:128 |
| ; CHECK-NEXT: scratch_load_b32 v43, off, s33 offset:132 |
| ; CHECK-NEXT: scratch_load_b32 v42, off, s33 offset:136 |
| ; CHECK-NEXT: scratch_load_b32 v41, off, s33 offset:140 |
| ; CHECK-NEXT: scratch_load_b32 v40, off, s33 offset:144 |
| ; CHECK-NEXT: v_readlane_b32 s53, v93, 13 |
| ; CHECK-NEXT: v_readlane_b32 s52, v93, 12 |
| ; CHECK-NEXT: v_readlane_b32 s51, v93, 11 |
| ; CHECK-NEXT: v_readlane_b32 s50, v93, 10 |
| ; CHECK-NEXT: v_readlane_b32 s49, v93, 9 |
| ; CHECK-NEXT: v_readlane_b32 s48, v93, 8 |
| ; CHECK-NEXT: v_readlane_b32 s39, v93, 7 |
| ; CHECK-NEXT: v_readlane_b32 s38, v93, 6 |
| ; CHECK-NEXT: v_readlane_b32 s37, v93, 5 |
| ; CHECK-NEXT: v_readlane_b32 s36, v93, 4 |
| ; CHECK-NEXT: v_readlane_b32 s35, v93, 3 |
| ; CHECK-NEXT: v_readlane_b32 s34, v93, 2 |
| ; CHECK-NEXT: v_readlane_b32 s31, v93, 1 |
| ; CHECK-NEXT: v_readlane_b32 s30, v93, 0 |
| ; CHECK-NEXT: s_mov_b32 s32, s33 |
| ; CHECK-NEXT: v_readlane_b32 s0, v93, 14 |
| ; CHECK-NEXT: s_or_saveexec_b32 s1, -1 |
| ; CHECK-NEXT: scratch_load_b32 v93, off, s33 offset:404 ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; CHECK-NEXT: s_mov_b32 exec_lo, s1 |
| ; CHECK-NEXT: s_mov_b32 s33, s0 |
| ; CHECK-NEXT: s_wait_loadcnt 0x0 |
| ; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
; %A is loaded before the call to @foo and consumed only by the call to @bar,
; so it has to survive the first call. %B is intentionally left unused.
| %A = load <32 x i64>, ptr addrspace(1) null, align 256 |
| %B = call i32 @foo() |
| %C = call <8 x half> @bar(<32 x i64> %A) |
| ret <8 x half> %C |
| } |