| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX9_W64 |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32 |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64 |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32 |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64 |
| ; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64-O0 |
| |
| ; Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions. |
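;
; MUBUF instructions require a uniform (SGPR) resource descriptor, so when the
; descriptor arrives in VGPRs the legalizer wraps the load in a "waterfall
; loop", as the checks below show: v_readfirstlane_b32 copies the descriptor
; from the lowest active lane into SGPRs, two v_cmp_eq_u64 compares (one per
; descriptor half) combined with s_and build a mask of lanes holding that same
; value, s_and_saveexec restricts exec to those lanes for the
; buffer_load_format_x, and the s_xor of exec plus s_cbranch_execnz repeat the
; loop until every lane has been serviced, after which the original exec mask
; is restored from the saved copy.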
| |
| define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { |
| ; GFX9_W64-LABEL: mubuf_vgpr: |
| ; GFX9_W64: ; %bb.0: |
| ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9_W64-NEXT: s_mov_b64 s[6:7], exec |
| ; GFX9_W64-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v0 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v1 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s10, v2 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s11, v3 |
| ; GFX9_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] |
| ; GFX9_W64-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] |
| ; GFX9_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] |
| ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] |
| ; GFX9_W64-NEXT: s_nop 0 |
| ; GFX9_W64-NEXT: buffer_load_format_x v5, v4, s[8:11], 0 idxen |
| ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 |
| ; GFX9_W64-NEXT: ; implicit-def: $vgpr4 |
| ; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] |
| ; GFX9_W64-NEXT: s_cbranch_execnz .LBB0_1 |
| ; GFX9_W64-NEXT: ; %bb.2: |
| ; GFX9_W64-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9_W64-NEXT: v_mov_b32_e32 v0, v5 |
| ; GFX9_W64-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1010_W32-LABEL: mubuf_vgpr: |
| ; GFX1010_W32: ; %bb.0: |
| ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX1010_W32-NEXT: s_mov_b32 s5, exec_lo |
| ; GFX1010_W32-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v0 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v1 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s10, v2 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s11, v3 |
| ; GFX1010_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] |
| ; GFX1010_W32-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] |
| ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 |
| ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 |
| ; GFX1010_W32-NEXT: buffer_load_format_x v5, v4, s[8:11], 0 idxen |
| ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 |
| ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4 |
| ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 |
| ; GFX1010_W32-NEXT: s_cbranch_execnz .LBB0_1 |
| ; GFX1010_W32-NEXT: ; %bb.2: |
| ; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5 |
| ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) |
| ; GFX1010_W32-NEXT: v_mov_b32_e32 v0, v5 |
| ; GFX1010_W32-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1010_W64-LABEL: mubuf_vgpr: |
| ; GFX1010_W64: ; %bb.0: |
| ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX1010_W64-NEXT: s_mov_b64 s[6:7], exec |
| ; GFX1010_W64-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v0 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v1 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s10, v2 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s11, v3 |
| ; GFX1010_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] |
| ; GFX1010_W64-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] |
| ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] |
| ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] |
| ; GFX1010_W64-NEXT: buffer_load_format_x v5, v4, s[8:11], 0 idxen |
| ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 |
| ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4 |
| ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] |
| ; GFX1010_W64-NEXT: s_cbranch_execnz .LBB0_1 |
| ; GFX1010_W64-NEXT: ; %bb.2: |
| ; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) |
| ; GFX1010_W64-NEXT: v_mov_b32_e32 v0, v5 |
| ; GFX1010_W64-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1100_W32-LABEL: mubuf_vgpr: |
| ; GFX1100_W32: ; %bb.0: |
| ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo |
| ; GFX1100_W32-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s4, v0 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s5, v1 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s6, v2 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s7, v3 |
| ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] |
| ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] |
| ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 |
| ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 |
| ; GFX1100_W32-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen |
| ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 |
| ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4 |
| ; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 |
| ; GFX1100_W32-NEXT: s_cbranch_execnz .LBB0_1 |
| ; GFX1100_W32-NEXT: ; %bb.2: |
| ; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) |
| ; GFX1100_W32-NEXT: v_mov_b32_e32 v0, v5 |
| ; GFX1100_W32-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1100_W64-LABEL: mubuf_vgpr: |
| ; GFX1100_W64: ; %bb.0: |
| ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec |
| ; GFX1100_W64-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s4, v0 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s5, v1 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s6, v2 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s7, v3 |
| ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] |
| ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] |
| ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] |
| ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] |
| ; GFX1100_W64-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen |
| ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 |
| ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4 |
| ; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] |
| ; GFX1100_W64-NEXT: s_cbranch_execnz .LBB0_1 |
| ; GFX1100_W64-NEXT: ; %bb.2: |
| ; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] |
| ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) |
| ; GFX1100_W64-NEXT: v_mov_b32_e32 v0, v5 |
| ; GFX1100_W64-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; W64-O0-LABEL: mubuf_vgpr: |
| ; W64-O0: ; %bb.0: |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] |
| ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: v_mov_b32_e32 v5, v2 |
| ; W64-O0-NEXT: v_mov_b32_e32 v2, v1 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec |
| ; W64-O0-NEXT: v_mov_b32_e32 v6, v3 |
| ; W64-O0-NEXT: v_mov_b32_e32 v4, v6 |
| ; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec |
| ; W64-O0-NEXT: v_mov_b32_e32 v1, v2 |
| ; W64-O0-NEXT: v_mov_b32_e32 v6, v1 |
| ; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec |
| ; W64-O0-NEXT: v_mov_b32_e32 v1, v6 |
| ; W64-O0-NEXT: v_mov_b32_e32 v2, v5 |
| ; W64-O0-NEXT: v_mov_b32_e32 v3, v4 |
| ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_nop 0 |
| ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 |
| ; W64-O0-NEXT: s_mov_b32 s4, 0 |
| ; W64-O0-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane |
| ; W64-O0-NEXT: v_writelane_b32 v7, s4, 0 |
| ; W64-O0-NEXT: s_mov_b64 s[4:5], exec |
| ; W64-O0-NEXT: v_writelane_b32 v7, s4, 1 |
| ; W64-O0-NEXT: v_writelane_b32 v7, s5, 2 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_waitcnt vmcnt(3) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 |
| ; W64-O0-NEXT: s_waitcnt vmcnt(2) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 |
| ; W64-O0-NEXT: s_mov_b32 s4, s8 |
| ; W64-O0-NEXT: s_mov_b32 s5, s12 |
| ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(1) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 |
| ; W64-O0-NEXT: s_mov_b32 s10, s7 |
| ; W64-O0-NEXT: s_mov_b32 s11, s6 |
| ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] |
| ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] |
| ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 |
| ; W64-O0-NEXT: s_mov_b32 s9, s12 |
| ; W64-O0-NEXT: s_mov_b32 s10, s7 |
| ; W64-O0-NEXT: s_mov_b32 s11, s6 |
| ; W64-O0-NEXT: v_writelane_b32 v7, s8, 3 |
| ; W64-O0-NEXT: v_writelane_b32 v7, s9, 4 |
| ; W64-O0-NEXT: v_writelane_b32 v7, s10, 5 |
| ; W64-O0-NEXT: v_writelane_b32 v7, s11, 6 |
| ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] |
| ; W64-O0-NEXT: v_writelane_b32 v7, s4, 7 |
| ; W64-O0-NEXT: v_writelane_b32 v7, s5, 8 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: v_readlane_b32 s4, v7, 7 |
| ; W64-O0-NEXT: v_readlane_b32 s5, v7, 8 |
| ; W64-O0-NEXT: v_readlane_b32 s8, v7, 3 |
| ; W64-O0-NEXT: v_readlane_b32 s9, v7, 4 |
| ; W64-O0-NEXT: v_readlane_b32 s10, v7, 5 |
| ; W64-O0-NEXT: v_readlane_b32 s11, v7, 6 |
| ; W64-O0-NEXT: v_readlane_b32 s6, v7, 0 |
| ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: s_nop 2 |
| ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] |
| ; W64-O0-NEXT: s_cbranch_execnz .LBB0_1 |
| ; W64-O0-NEXT: ; %bb.3: |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: v_readlane_b32 s4, v7, 1 |
| ; W64-O0-NEXT: v_readlane_b32 s5, v7, 2 |
| ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] |
| ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; W64-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: s_setpc_b64 s[30:31] |
| %call = call float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %i, i32 %c, i32 0, i32 0, i32 0) #1 |
| ret float %call |
| } |
| |
| |
| |
; FIXME: redundant s_mov: between the two waterfall loops exec is restored
; from the saved mask and then immediately copied back into the same SGPR(s).
| |
| define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) %j, i32 %c, ptr addrspace(1) %out0, ptr addrspace(1) %out1) #0 { |
| ; GFX9_W64-LABEL: mubuf_vgpr_adjacent_in_block: |
| ; GFX9_W64: ; %bb.0: ; %entry |
| ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9_W64-NEXT: s_mov_b64 s[6:7], exec |
| ; GFX9_W64-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v0 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v1 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s10, v2 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s11, v3 |
| ; GFX9_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] |
| ; GFX9_W64-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] |
| ; GFX9_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] |
| ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] |
| ; GFX9_W64-NEXT: s_nop 0 |
| ; GFX9_W64-NEXT: buffer_load_format_x v13, v8, s[8:11], 0 idxen |
| ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 |
| ; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] |
| ; GFX9_W64-NEXT: s_cbranch_execnz .LBB1_1 |
| ; GFX9_W64-NEXT: ; %bb.2: |
| ; GFX9_W64-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9_W64-NEXT: s_mov_b64 s[6:7], exec |
| ; GFX9_W64-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v4 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v5 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s10, v6 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s11, v7 |
| ; GFX9_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[4:5] |
| ; GFX9_W64-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[6:7] |
| ; GFX9_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] |
| ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] |
| ; GFX9_W64-NEXT: s_nop 0 |
| ; GFX9_W64-NEXT: buffer_load_format_x v0, v8, s[8:11], 0 idxen |
| ; GFX9_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 |
| ; GFX9_W64-NEXT: ; implicit-def: $vgpr8 |
| ; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] |
| ; GFX9_W64-NEXT: s_cbranch_execnz .LBB1_3 |
| ; GFX9_W64-NEXT: ; %bb.4: |
| ; GFX9_W64-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9_W64-NEXT: s_waitcnt vmcnt(1) |
| ; GFX9_W64-NEXT: global_store_dword v[9:10], v13, off |
| ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9_W64-NEXT: global_store_dword v[11:12], v0, off |
| ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9_W64-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1010_W32-LABEL: mubuf_vgpr_adjacent_in_block: |
| ; GFX1010_W32: ; %bb.0: ; %entry |
| ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX1010_W32-NEXT: s_mov_b32 s5, exec_lo |
| ; GFX1010_W32-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v0 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v1 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s10, v2 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s11, v3 |
| ; GFX1010_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] |
| ; GFX1010_W32-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] |
| ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 |
| ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 |
| ; GFX1010_W32-NEXT: buffer_load_format_x v13, v8, s[8:11], 0 idxen |
| ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 |
| ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 |
| ; GFX1010_W32-NEXT: s_cbranch_execnz .LBB1_1 |
| ; GFX1010_W32-NEXT: ; %bb.2: |
| ; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5 |
| ; GFX1010_W32-NEXT: s_mov_b32 s5, exec_lo |
| ; GFX1010_W32-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v4 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v5 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s10, v6 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s11, v7 |
| ; GFX1010_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[4:5] |
| ; GFX1010_W32-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[6:7] |
| ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 |
| ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 |
| ; GFX1010_W32-NEXT: buffer_load_format_x v0, v8, s[8:11], 0 idxen |
| ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 |
| ; GFX1010_W32-NEXT: ; implicit-def: $vgpr8 |
| ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 |
| ; GFX1010_W32-NEXT: s_cbranch_execnz .LBB1_3 |
| ; GFX1010_W32-NEXT: ; %bb.4: |
| ; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5 |
| ; GFX1010_W32-NEXT: s_waitcnt vmcnt(1) |
| ; GFX1010_W32-NEXT: global_store_dword v[9:10], v13, off |
| ; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) |
| ; GFX1010_W32-NEXT: global_store_dword v[11:12], v0, off |
| ; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX1010_W32-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1010_W64-LABEL: mubuf_vgpr_adjacent_in_block: |
| ; GFX1010_W64: ; %bb.0: ; %entry |
| ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX1010_W64-NEXT: s_mov_b64 s[6:7], exec |
| ; GFX1010_W64-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v0 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v1 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s10, v2 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s11, v3 |
| ; GFX1010_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] |
| ; GFX1010_W64-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] |
| ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] |
| ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] |
| ; GFX1010_W64-NEXT: buffer_load_format_x v13, v8, s[8:11], 0 idxen |
| ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 |
| ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] |
| ; GFX1010_W64-NEXT: s_cbranch_execnz .LBB1_1 |
| ; GFX1010_W64-NEXT: ; %bb.2: |
| ; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX1010_W64-NEXT: s_mov_b64 s[6:7], exec |
| ; GFX1010_W64-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v4 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v5 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s10, v6 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s11, v7 |
| ; GFX1010_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[4:5] |
| ; GFX1010_W64-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[6:7] |
| ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] |
| ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] |
| ; GFX1010_W64-NEXT: buffer_load_format_x v0, v8, s[8:11], 0 idxen |
| ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 |
| ; GFX1010_W64-NEXT: ; implicit-def: $vgpr8 |
| ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] |
| ; GFX1010_W64-NEXT: s_cbranch_execnz .LBB1_3 |
| ; GFX1010_W64-NEXT: ; %bb.4: |
| ; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX1010_W64-NEXT: s_waitcnt vmcnt(1) |
| ; GFX1010_W64-NEXT: global_store_dword v[9:10], v13, off |
| ; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) |
| ; GFX1010_W64-NEXT: global_store_dword v[11:12], v0, off |
| ; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX1010_W64-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1100_W32-LABEL: mubuf_vgpr_adjacent_in_block: |
| ; GFX1100_W32: ; %bb.0: ; %entry |
| ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo |
| ; GFX1100_W32-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s4, v0 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s5, v1 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s6, v2 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s7, v3 |
| ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] |
| ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] |
| ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 |
| ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 |
| ; GFX1100_W32-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen |
| ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 |
| ; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 |
| ; GFX1100_W32-NEXT: s_cbranch_execnz .LBB1_1 |
| ; GFX1100_W32-NEXT: ; %bb.2: |
| ; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo |
| ; GFX1100_W32-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s4, v4 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s5, v5 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s6, v6 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s7, v7 |
| ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] |
| ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] |
| ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 |
| ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 |
| ; GFX1100_W32-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen |
| ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 |
| ; GFX1100_W32-NEXT: ; implicit-def: $vgpr8 |
| ; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 |
| ; GFX1100_W32-NEXT: s_cbranch_execnz .LBB1_3 |
| ; GFX1100_W32-NEXT: ; %bb.4: |
| ; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX1100_W32-NEXT: s_waitcnt vmcnt(1) |
| ; GFX1100_W32-NEXT: global_store_b32 v[9:10], v13, off dlc |
| ; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) |
| ; GFX1100_W32-NEXT: global_store_b32 v[11:12], v0, off dlc |
| ; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX1100_W32-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1100_W64-LABEL: mubuf_vgpr_adjacent_in_block: |
| ; GFX1100_W64: ; %bb.0: ; %entry |
| ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec |
| ; GFX1100_W64-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s4, v0 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s5, v1 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s6, v2 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s7, v3 |
| ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] |
| ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] |
| ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] |
| ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] |
| ; GFX1100_W64-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen |
| ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 |
| ; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] |
| ; GFX1100_W64-NEXT: s_cbranch_execnz .LBB1_1 |
| ; GFX1100_W64-NEXT: ; %bb.2: |
| ; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] |
| ; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec |
| ; GFX1100_W64-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s4, v4 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s5, v5 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s6, v6 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s7, v7 |
| ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5] |
| ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7] |
| ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] |
| ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] |
| ; GFX1100_W64-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen |
| ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 |
| ; GFX1100_W64-NEXT: ; implicit-def: $vgpr8 |
| ; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] |
| ; GFX1100_W64-NEXT: s_cbranch_execnz .LBB1_3 |
| ; GFX1100_W64-NEXT: ; %bb.4: |
| ; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] |
| ; GFX1100_W64-NEXT: s_waitcnt vmcnt(1) |
| ; GFX1100_W64-NEXT: global_store_b32 v[9:10], v13, off dlc |
| ; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) |
| ; GFX1100_W64-NEXT: global_store_b32 v[11:12], v0, off dlc |
| ; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX1100_W64-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; W64-O0-LABEL: mubuf_vgpr_adjacent_in_block: |
| ; W64-O0: ; %bb.0: ; %entry |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] |
| ; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: v_mov_b32_e32 v14, v6 |
| ; W64-O0-NEXT: v_mov_b32_e32 v8, v5 |
| ; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: v_mov_b32_e32 v13, v4 |
| ; W64-O0-NEXT: v_mov_b32_e32 v7, v2 |
| ; W64-O0-NEXT: v_mov_b32_e32 v4, v1 |
| ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: v_mov_b32_e32 v2, v0 |
| ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec |
| ; W64-O0-NEXT: s_waitcnt vmcnt(2) |
| ; W64-O0-NEXT: v_mov_b32_e32 v15, v5 |
| ; W64-O0-NEXT: v_mov_b32_e32 v5, v15 |
| ; W64-O0-NEXT: v_mov_b32_e32 v6, v14 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec |
| ; W64-O0-NEXT: v_mov_b32_e32 v14, v8 |
| ; W64-O0-NEXT: v_mov_b32_e32 v8, v14 |
| ; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 killed $vgpr13_vgpr14 killed $exec |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec |
| ; W64-O0-NEXT: v_mov_b32_e32 v14, v8 |
| ; W64-O0-NEXT: v_mov_b32_e32 v15, v6 |
| ; W64-O0-NEXT: v_mov_b32_e32 v16, v5 |
| ; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_nop 0 |
| ; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec |
| ; W64-O0-NEXT: v_mov_b32_e32 v8, v3 |
| ; W64-O0-NEXT: v_mov_b32_e32 v6, v8 |
| ; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec |
| ; W64-O0-NEXT: v_mov_b32_e32 v3, v4 |
| ; W64-O0-NEXT: v_mov_b32_e32 v8, v3 |
| ; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec |
| ; W64-O0-NEXT: v_mov_b32_e32 v3, v8 |
| ; W64-O0-NEXT: v_mov_b32_e32 v4, v7 |
| ; W64-O0-NEXT: v_mov_b32_e32 v5, v6 |
| ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_nop 0 |
| ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec |
| ; W64-O0-NEXT: v_mov_b32_e32 v2, v12 |
| ; W64-O0-NEXT: s_waitcnt vmcnt(9) |
| ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_nop 0 |
| ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec |
| ; W64-O0-NEXT: v_mov_b32_e32 v1, v10 |
| ; W64-O0-NEXT: s_waitcnt vmcnt(10) |
| ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_nop 0 |
| ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 |
| ; W64-O0-NEXT: s_mov_b32 s4, 0 |
| ; W64-O0-NEXT: ; implicit-def: $vgpr17 : SGPR spill to VGPR lane |
| ; W64-O0-NEXT: v_writelane_b32 v17, s4, 0 |
| ; W64-O0-NEXT: s_mov_b64 s[4:5], exec |
| ; W64-O0-NEXT: v_writelane_b32 v17, s4, 1 |
| ; W64-O0-NEXT: v_writelane_b32 v17, s5, 2 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_waitcnt vmcnt(3) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 |
| ; W64-O0-NEXT: s_waitcnt vmcnt(2) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 |
| ; W64-O0-NEXT: s_mov_b32 s4, s8 |
| ; W64-O0-NEXT: s_mov_b32 s5, s12 |
| ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(1) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 |
| ; W64-O0-NEXT: s_mov_b32 s10, s7 |
| ; W64-O0-NEXT: s_mov_b32 s11, s6 |
| ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] |
| ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] |
| ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 |
| ; W64-O0-NEXT: s_mov_b32 s9, s12 |
| ; W64-O0-NEXT: s_mov_b32 s10, s7 |
| ; W64-O0-NEXT: s_mov_b32 s11, s6 |
| ; W64-O0-NEXT: v_writelane_b32 v17, s8, 3 |
| ; W64-O0-NEXT: v_writelane_b32 v17, s9, 4 |
| ; W64-O0-NEXT: v_writelane_b32 v17, s10, 5 |
| ; W64-O0-NEXT: v_writelane_b32 v17, s11, 6 |
| ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] |
| ; W64-O0-NEXT: v_writelane_b32 v17, s4, 7 |
| ; W64-O0-NEXT: v_writelane_b32 v17, s5, 8 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB1_1 Depth=1 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: v_readlane_b32 s4, v17, 7 |
| ; W64-O0-NEXT: v_readlane_b32 s5, v17, 8 |
| ; W64-O0-NEXT: v_readlane_b32 s8, v17, 3 |
| ; W64-O0-NEXT: v_readlane_b32 s9, v17, 4 |
| ; W64-O0-NEXT: v_readlane_b32 s10, v17, 5 |
| ; W64-O0-NEXT: v_readlane_b32 s11, v17, 6 |
| ; W64-O0-NEXT: v_readlane_b32 s6, v17, 0 |
| ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: s_nop 2 |
| ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] |
| ; W64-O0-NEXT: s_cbranch_execnz .LBB1_1 |
| ; W64-O0-NEXT: ; %bb.3: |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: v_readlane_b32 s4, v17, 1 |
| ; W64-O0-NEXT: v_readlane_b32 s5, v17, 2 |
| ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] |
| ; W64-O0-NEXT: s_mov_b64 s[4:5], exec |
| ; W64-O0-NEXT: v_writelane_b32 v17, s4, 9 |
| ; W64-O0-NEXT: v_writelane_b32 v17, s5, 10 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: .LBB1_4: ; =>This Inner Loop Header: Depth=1 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_waitcnt vmcnt(3) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 |
| ; W64-O0-NEXT: s_waitcnt vmcnt(2) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 |
| ; W64-O0-NEXT: s_mov_b32 s4, s8 |
| ; W64-O0-NEXT: s_mov_b32 s5, s12 |
| ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(1) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 |
| ; W64-O0-NEXT: s_mov_b32 s10, s7 |
| ; W64-O0-NEXT: s_mov_b32 s11, s6 |
| ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] |
| ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] |
| ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 |
| ; W64-O0-NEXT: s_mov_b32 s9, s12 |
| ; W64-O0-NEXT: s_mov_b32 s10, s7 |
| ; W64-O0-NEXT: s_mov_b32 s11, s6 |
| ; W64-O0-NEXT: v_writelane_b32 v17, s8, 11 |
| ; W64-O0-NEXT: v_writelane_b32 v17, s9, 12 |
| ; W64-O0-NEXT: v_writelane_b32 v17, s10, 13 |
| ; W64-O0-NEXT: v_writelane_b32 v17, s11, 14 |
| ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] |
| ; W64-O0-NEXT: v_writelane_b32 v17, s4, 15 |
| ; W64-O0-NEXT: v_writelane_b32 v17, s5, 16 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: ; %bb.5: ; in Loop: Header=BB1_4 Depth=1 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: v_readlane_b32 s4, v17, 15 |
| ; W64-O0-NEXT: v_readlane_b32 s5, v17, 16 |
| ; W64-O0-NEXT: v_readlane_b32 s8, v17, 11 |
| ; W64-O0-NEXT: v_readlane_b32 s9, v17, 12 |
| ; W64-O0-NEXT: v_readlane_b32 s10, v17, 13 |
| ; W64-O0-NEXT: v_readlane_b32 s11, v17, 14 |
| ; W64-O0-NEXT: v_readlane_b32 s6, v17, 0 |
| ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: s_nop 2 |
| ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] |
| ; W64-O0-NEXT: s_cbranch_execnz .LBB1_4 |
| ; W64-O0-NEXT: ; %bb.6: |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: v_readlane_b32 s4, v17, 9 |
| ; W64-O0-NEXT: v_readlane_b32 s5, v17, 10 |
| ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] |
| ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: global_store_dword v[3:4], v5, off |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: global_store_dword v[0:1], v2, off |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; W64-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| %val0 = call float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %i, i32 %c, i32 0, i32 0, i32 0) #1 |
| %val1 = call float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %j, i32 %c, i32 0, i32 0, i32 0) #1 |
| store volatile float %val0, ptr addrspace(1) %out0 |
| store volatile float %val1, ptr addrspace(1) %out1 |
| ret void |
| } |
| |
| ; Confirm spills do not occur between the XOR and branch that terminate the |
| ; waterfall loop BBs. |
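; A spill placed between the s_xor of exec and s_cbranch_execnz would run
; under the already-updated exec mask (zero on the final iteration), so the
; stored value could be lost; the -O0 checks instead place the
; buffer_store_dword spill of the loaded result before the exec update.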
| |
| define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, i32 %c, ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { |
| ; GFX9_W64-LABEL: mubuf_vgpr_outside_entry: |
| ; GFX9_W64: ; %bb.0: ; %entry |
| ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9_W64-NEXT: ;;#ASMSTART |
| ; GFX9_W64-NEXT: s_mov_b32 s4, 17 |
| ; GFX9_W64-NEXT: ;;#ASMEND |
| ; GFX9_W64-NEXT: v_mov_b32_e32 v8, s4 |
| ; GFX9_W64-NEXT: s_mov_b64 s[12:13], exec |
| ; GFX9_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v0 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v1 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s10, v2 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s11, v3 |
| ; GFX9_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] |
| ; GFX9_W64-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[2:3] |
| ; GFX9_W64-NEXT: s_and_b64 s[6:7], vcc, s[6:7] |
| ; GFX9_W64-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] |
| ; GFX9_W64-NEXT: s_nop 0 |
| ; GFX9_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen |
| ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 |
| ; GFX9_W64-NEXT: ; implicit-def: $vgpr8 |
| ; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[6:7] |
| ; GFX9_W64-NEXT: s_cbranch_execnz .LBB2_1 |
| ; GFX9_W64-NEXT: ; %bb.2: |
| ; GFX9_W64-NEXT: s_mov_b64 exec, s[12:13] |
| ; GFX9_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v31 |
| ; GFX9_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 |
| ; GFX9_W64-NEXT: s_and_saveexec_b64 s[6:7], vcc |
| ; GFX9_W64-NEXT: s_cbranch_execz .LBB2_6 |
| ; GFX9_W64-NEXT: ; %bb.3: ; %bb1 |
| ; GFX9_W64-NEXT: v_mov_b32_e32 v0, s4 |
| ; GFX9_W64-NEXT: s_mov_b64 s[12:13], exec |
| ; GFX9_W64-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v4 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v5 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s10, v6 |
| ; GFX9_W64-NEXT: v_readfirstlane_b32 s11, v7 |
| ; GFX9_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[4:5] |
| ; GFX9_W64-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[6:7] |
| ; GFX9_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] |
| ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] |
| ; GFX9_W64-NEXT: s_nop 0 |
| ; GFX9_W64-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen |
| ; GFX9_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 |
| ; GFX9_W64-NEXT: ; implicit-def: $vgpr0 |
| ; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] |
| ; GFX9_W64-NEXT: s_cbranch_execnz .LBB2_4 |
| ; GFX9_W64-NEXT: ; %bb.5: |
| ; GFX9_W64-NEXT: s_mov_b64 exec, s[12:13] |
| ; GFX9_W64-NEXT: .LBB2_6: ; %bb2 |
| ; GFX9_W64-NEXT: s_or_b64 exec, exec, s[6:7] |
| ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9_W64-NEXT: global_store_dword v[11:12], v9, off |
| ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9_W64-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1010_W32-LABEL: mubuf_vgpr_outside_entry: |
| ; GFX1010_W32: ; %bb.0: ; %entry |
| ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX1010_W32-NEXT: ;;#ASMSTART |
| ; GFX1010_W32-NEXT: s_mov_b32 s4, 17 |
| ; GFX1010_W32-NEXT: ;;#ASMEND |
| ; GFX1010_W32-NEXT: v_mov_b32_e32 v8, s4 |
| ; GFX1010_W32-NEXT: s_mov_b32 s6, exec_lo |
| ; GFX1010_W32-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v0 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v1 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s10, v2 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s11, v3 |
| ; GFX1010_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] |
| ; GFX1010_W32-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[2:3] |
| ; GFX1010_W32-NEXT: s_and_b32 s5, vcc_lo, s5 |
| ; GFX1010_W32-NEXT: s_and_saveexec_b32 s5, s5 |
| ; GFX1010_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen |
| ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 |
| ; GFX1010_W32-NEXT: ; implicit-def: $vgpr8 |
| ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s5 |
| ; GFX1010_W32-NEXT: s_cbranch_execnz .LBB2_1 |
| ; GFX1010_W32-NEXT: ; %bb.2: |
| ; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX1010_W32-NEXT: v_and_b32_e32 v0, 0x3ff, v31 |
| ; GFX1010_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 |
| ; GFX1010_W32-NEXT: s_and_saveexec_b32 s5, vcc_lo |
| ; GFX1010_W32-NEXT: s_cbranch_execz .LBB2_6 |
| ; GFX1010_W32-NEXT: ; %bb.3: ; %bb1 |
| ; GFX1010_W32-NEXT: v_mov_b32_e32 v0, s4 |
| ; GFX1010_W32-NEXT: s_mov_b32 s6, exec_lo |
| ; GFX1010_W32-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v4 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v5 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s10, v6 |
| ; GFX1010_W32-NEXT: v_readfirstlane_b32 s11, v7 |
| ; GFX1010_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[4:5] |
| ; GFX1010_W32-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[6:7] |
| ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 |
| ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 |
| ; GFX1010_W32-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen |
| ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 |
| ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0 |
| ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 |
| ; GFX1010_W32-NEXT: s_cbranch_execnz .LBB2_4 |
| ; GFX1010_W32-NEXT: ; %bb.5: |
| ; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX1010_W32-NEXT: .LBB2_6: ; %bb2 |
| ; GFX1010_W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 |
| ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) |
| ; GFX1010_W32-NEXT: global_store_dword v[11:12], v9, off |
| ; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX1010_W32-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1010_W64-LABEL: mubuf_vgpr_outside_entry: |
| ; GFX1010_W64: ; %bb.0: ; %entry |
| ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX1010_W64-NEXT: ;;#ASMSTART |
| ; GFX1010_W64-NEXT: s_mov_b32 s4, 17 |
| ; GFX1010_W64-NEXT: ;;#ASMEND |
| ; GFX1010_W64-NEXT: v_mov_b32_e32 v8, s4 |
| ; GFX1010_W64-NEXT: s_mov_b64 s[12:13], exec |
| ; GFX1010_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v0 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v1 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s10, v2 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s11, v3 |
| ; GFX1010_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] |
| ; GFX1010_W64-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[2:3] |
| ; GFX1010_W64-NEXT: s_and_b64 s[6:7], vcc, s[6:7] |
| ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] |
| ; GFX1010_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen |
| ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 |
| ; GFX1010_W64-NEXT: ; implicit-def: $vgpr8 |
| ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[6:7] |
| ; GFX1010_W64-NEXT: s_cbranch_execnz .LBB2_1 |
| ; GFX1010_W64-NEXT: ; %bb.2: |
| ; GFX1010_W64-NEXT: s_mov_b64 exec, s[12:13] |
| ; GFX1010_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v31 |
| ; GFX1010_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 |
| ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[6:7], vcc |
| ; GFX1010_W64-NEXT: s_cbranch_execz .LBB2_6 |
| ; GFX1010_W64-NEXT: ; %bb.3: ; %bb1 |
| ; GFX1010_W64-NEXT: v_mov_b32_e32 v0, s4 |
| ; GFX1010_W64-NEXT: s_mov_b64 s[12:13], exec |
| ; GFX1010_W64-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v4 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v5 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s10, v6 |
| ; GFX1010_W64-NEXT: v_readfirstlane_b32 s11, v7 |
| ; GFX1010_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[4:5] |
| ; GFX1010_W64-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[6:7] |
| ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] |
| ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] |
| ; GFX1010_W64-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen |
| ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 |
| ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0 |
| ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] |
| ; GFX1010_W64-NEXT: s_cbranch_execnz .LBB2_4 |
| ; GFX1010_W64-NEXT: ; %bb.5: |
| ; GFX1010_W64-NEXT: s_mov_b64 exec, s[12:13] |
| ; GFX1010_W64-NEXT: .LBB2_6: ; %bb2 |
| ; GFX1010_W64-NEXT: s_or_b64 exec, exec, s[6:7] |
| ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) |
| ; GFX1010_W64-NEXT: global_store_dword v[11:12], v9, off |
| ; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX1010_W64-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1100_W32-LABEL: mubuf_vgpr_outside_entry: |
| ; GFX1100_W32: ; %bb.0: ; %entry |
| ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX1100_W32-NEXT: ;;#ASMSTART |
| ; GFX1100_W32-NEXT: s_mov_b32 s4, 17 |
| ; GFX1100_W32-NEXT: ;;#ASMEND |
| ; GFX1100_W32-NEXT: v_mov_b32_e32 v8, s4 |
| ; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo |
| ; GFX1100_W32-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s8, v0 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s9, v1 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s10, v2 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s11, v3 |
| ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] |
| ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[10:11], v[2:3] |
| ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 |
| ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 |
| ; GFX1100_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen |
| ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 |
| ; GFX1100_W32-NEXT: ; implicit-def: $vgpr8 |
| ; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 |
| ; GFX1100_W32-NEXT: s_cbranch_execnz .LBB2_1 |
| ; GFX1100_W32-NEXT: ; %bb.2: |
| ; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX1100_W32-NEXT: v_and_b32_e32 v0, 0x3ff, v31 |
| ; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo |
| ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1100_W32-NEXT: v_cmpx_eq_u32_e32 0, v0 |
| ; GFX1100_W32-NEXT: s_cbranch_execz .LBB2_6 |
| ; GFX1100_W32-NEXT: ; %bb.3: ; %bb1 |
| ; GFX1100_W32-NEXT: v_mov_b32_e32 v0, s4 |
| ; GFX1100_W32-NEXT: s_mov_b32 s2, exec_lo |
| ; GFX1100_W32-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s4, v4 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s5, v5 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s6, v6 |
| ; GFX1100_W32-NEXT: v_readfirstlane_b32 s7, v7 |
| ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] |
| ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] |
| ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 |
| ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 |
| ; GFX1100_W32-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen |
| ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 |
| ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0 |
| ; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 |
| ; GFX1100_W32-NEXT: s_cbranch_execnz .LBB2_4 |
| ; GFX1100_W32-NEXT: ; %bb.5: |
| ; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s2 |
| ; GFX1100_W32-NEXT: .LBB2_6: ; %bb2 |
| ; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX1100_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 |
| ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) |
| ; GFX1100_W32-NEXT: global_store_b32 v[11:12], v9, off dlc |
| ; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX1100_W32-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX1100_W64-LABEL: mubuf_vgpr_outside_entry: |
| ; GFX1100_W64: ; %bb.0: ; %entry |
| ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX1100_W64-NEXT: ;;#ASMSTART |
| ; GFX1100_W64-NEXT: s_mov_b32 s4, 17 |
| ; GFX1100_W64-NEXT: ;;#ASMEND |
| ; GFX1100_W64-NEXT: v_mov_b32_e32 v8, s4 |
| ; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec |
| ; GFX1100_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s8, v0 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s9, v1 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s10, v2 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s11, v3 |
| ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] |
| ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3] |
| ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] |
| ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] |
| ; GFX1100_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen |
| ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 |
| ; GFX1100_W64-NEXT: ; implicit-def: $vgpr8 |
| ; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] |
| ; GFX1100_W64-NEXT: s_cbranch_execnz .LBB2_1 |
| ; GFX1100_W64-NEXT: ; %bb.2: |
| ; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] |
| ; GFX1100_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v31 |
| ; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec |
| ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1100_W64-NEXT: v_cmpx_eq_u32_e32 0, v0 |
| ; GFX1100_W64-NEXT: s_cbranch_execz .LBB2_6 |
| ; GFX1100_W64-NEXT: ; %bb.3: ; %bb1 |
| ; GFX1100_W64-NEXT: v_mov_b32_e32 v0, s4 |
| ; GFX1100_W64-NEXT: s_mov_b64 s[8:9], exec |
| ; GFX1100_W64-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s4, v4 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s5, v5 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s6, v6 |
| ; GFX1100_W64-NEXT: v_readfirstlane_b32 s7, v7 |
| ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5] |
| ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7] |
| ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] |
| ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] |
| ; GFX1100_W64-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen |
| ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 |
| ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0 |
| ; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] |
| ; GFX1100_W64-NEXT: s_cbranch_execnz .LBB2_4 |
| ; GFX1100_W64-NEXT: ; %bb.5: |
| ; GFX1100_W64-NEXT: s_mov_b64 exec, s[8:9] |
| ; GFX1100_W64-NEXT: .LBB2_6: ; %bb2 |
| ; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX1100_W64-NEXT: s_or_b64 exec, exec, s[2:3] |
| ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) |
| ; GFX1100_W64-NEXT: global_store_b32 v[11:12], v9, off dlc |
| ; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX1100_W64-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; W64-O0-LABEL: mubuf_vgpr_outside_entry: |
| ; W64-O0: ; %bb.0: ; %entry |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] |
| ; W64-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: v_mov_b32_e32 v5, v4 |
| ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_nop 0 |
| ; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: v_mov_b32_e32 v9, v2 |
| ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: v_mov_b32_e32 v6, v1 |
| ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: v_mov_b32_e32 v8, v0 |
| ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec |
| ; W64-O0-NEXT: v_mov_b32_e32 v10, v3 |
| ; W64-O0-NEXT: v_mov_b32_e32 v3, v10 |
| ; W64-O0-NEXT: v_mov_b32_e32 v5, v9 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec |
| ; W64-O0-NEXT: v_mov_b32_e32 v9, v6 |
| ; W64-O0-NEXT: v_mov_b32_e32 v6, v9 |
| ; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec |
| ; W64-O0-NEXT: v_mov_b32_e32 v9, v6 |
| ; W64-O0-NEXT: v_mov_b32_e32 v10, v5 |
| ; W64-O0-NEXT: v_mov_b32_e32 v11, v3 |
| ; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_nop 0 |
| ; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec |
| ; W64-O0-NEXT: v_mov_b32_e32 v5, v7 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec |
| ; W64-O0-NEXT: s_waitcnt vmcnt(5) |
| ; W64-O0-NEXT: v_mov_b32_e32 v3, v1 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4 |
| ; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec |
| ; W64-O0-NEXT: v_mov_b32_e32 v1, v12 |
| ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_nop 0 |
| ; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_nop 0 |
| ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 |
| ; W64-O0-NEXT: s_waitcnt vmcnt(8) |
| ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_nop 0 |
| ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: ;;#ASMSTART |
| ; W64-O0-NEXT: s_mov_b32 s4, 17 |
| ; W64-O0-NEXT: ;;#ASMEND |
| ; W64-O0-NEXT: s_mov_b32 s5, s4 |
| ; W64-O0-NEXT: ; implicit-def: $vgpr13 : SGPR spill to VGPR lane |
| ; W64-O0-NEXT: v_writelane_b32 v13, s5, 0 |
| ; W64-O0-NEXT: s_mov_b32 s5, 0 |
| ; W64-O0-NEXT: v_writelane_b32 v13, s5, 1 |
| ; W64-O0-NEXT: v_mov_b32_e32 v0, s4 |
| ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_mov_b64 s[4:5], exec |
| ; W64-O0-NEXT: v_writelane_b32 v13, s4, 2 |
| ; W64-O0-NEXT: v_writelane_b32 v13, s5, 3 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_waitcnt vmcnt(3) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 |
| ; W64-O0-NEXT: s_waitcnt vmcnt(2) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 |
| ; W64-O0-NEXT: s_mov_b32 s4, s8 |
| ; W64-O0-NEXT: s_mov_b32 s5, s12 |
| ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(1) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 |
| ; W64-O0-NEXT: s_mov_b32 s10, s7 |
| ; W64-O0-NEXT: s_mov_b32 s11, s6 |
| ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] |
| ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] |
| ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 |
| ; W64-O0-NEXT: s_mov_b32 s9, s12 |
| ; W64-O0-NEXT: s_mov_b32 s10, s7 |
| ; W64-O0-NEXT: s_mov_b32 s11, s6 |
| ; W64-O0-NEXT: v_writelane_b32 v13, s8, 4 |
| ; W64-O0-NEXT: v_writelane_b32 v13, s9, 5 |
| ; W64-O0-NEXT: v_writelane_b32 v13, s10, 6 |
| ; W64-O0-NEXT: v_writelane_b32 v13, s11, 7 |
| ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] |
| ; W64-O0-NEXT: v_writelane_b32 v13, s4, 8 |
| ; W64-O0-NEXT: v_writelane_b32 v13, s5, 9 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB2_1 Depth=1 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: v_readlane_b32 s4, v13, 8 |
| ; W64-O0-NEXT: v_readlane_b32 s5, v13, 9 |
| ; W64-O0-NEXT: v_readlane_b32 s8, v13, 4 |
| ; W64-O0-NEXT: v_readlane_b32 s9, v13, 5 |
| ; W64-O0-NEXT: v_readlane_b32 s10, v13, 6 |
| ; W64-O0-NEXT: v_readlane_b32 s11, v13, 7 |
| ; W64-O0-NEXT: v_readlane_b32 s6, v13, 1 |
| ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: s_nop 2 |
| ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] |
| ; W64-O0-NEXT: s_cbranch_execnz .LBB2_1 |
| ; W64-O0-NEXT: ; %bb.3: |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: v_readlane_b32 s6, v13, 2 |
| ; W64-O0-NEXT: v_readlane_b32 s7, v13, 3 |
| ; W64-O0-NEXT: s_mov_b64 exec, s[6:7] |
| ; W64-O0-NEXT: v_readlane_b32 s4, v13, 1 |
| ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b32 s5, 0x3ff |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: v_and_b32_e64 v1, v1, s5 |
| ; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s4 |
| ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_mov_b64 s[4:5], exec |
| ; W64-O0-NEXT: v_writelane_b32 v13, s4, 10 |
| ; W64-O0-NEXT: v_writelane_b32 v13, s5, 11 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] |
| ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] |
| ; W64-O0-NEXT: s_cbranch_execz .LBB2_8 |
| ; W64-O0-NEXT: ; %bb.4: ; %bb1 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: v_readlane_b32 s4, v13, 0 |
| ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: v_mov_b32_e32 v6, v4 |
| ; W64-O0-NEXT: v_mov_b32_e32 v0, v3 |
| ; W64-O0-NEXT: v_mov_b32_e32 v4, v2 |
| ; W64-O0-NEXT: v_mov_b32_e32 v5, v1 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr5 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr5 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr5 |
| ; W64-O0-NEXT: ; implicit-def: $sgpr5 |
| ; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec |
| ; W64-O0-NEXT: v_mov_b32_e32 v1, v6 |
| ; W64-O0-NEXT: v_mov_b32_e32 v2, v5 |
| ; W64-O0-NEXT: v_mov_b32_e32 v3, v4 |
| ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_nop 0 |
| ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_mov_b32 s5, 0 |
| ; W64-O0-NEXT: v_writelane_b32 v13, s5, 12 |
| ; W64-O0-NEXT: v_mov_b32_e32 v0, s4 |
| ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_mov_b64 s[4:5], exec |
| ; W64-O0-NEXT: v_writelane_b32 v13, s4, 13 |
| ; W64-O0-NEXT: v_writelane_b32 v13, s5, 14 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_waitcnt vmcnt(3) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 |
| ; W64-O0-NEXT: s_waitcnt vmcnt(2) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 |
| ; W64-O0-NEXT: s_mov_b32 s4, s8 |
| ; W64-O0-NEXT: s_mov_b32 s5, s12 |
| ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(1) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 |
| ; W64-O0-NEXT: s_mov_b32 s10, s7 |
| ; W64-O0-NEXT: s_mov_b32 s11, s6 |
| ; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] |
| ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] |
| ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 |
| ; W64-O0-NEXT: s_mov_b32 s9, s12 |
| ; W64-O0-NEXT: s_mov_b32 s10, s7 |
| ; W64-O0-NEXT: s_mov_b32 s11, s6 |
| ; W64-O0-NEXT: v_writelane_b32 v13, s8, 15 |
| ; W64-O0-NEXT: v_writelane_b32 v13, s9, 16 |
| ; W64-O0-NEXT: v_writelane_b32 v13, s10, 17 |
| ; W64-O0-NEXT: v_writelane_b32 v13, s11, 18 |
| ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] |
| ; W64-O0-NEXT: v_writelane_b32 v13, s4, 19 |
| ; W64-O0-NEXT: v_writelane_b32 v13, s5, 20 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: ; %bb.6: ; in Loop: Header=BB2_5 Depth=1 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: v_readlane_b32 s4, v13, 19 |
| ; W64-O0-NEXT: v_readlane_b32 s5, v13, 20 |
| ; W64-O0-NEXT: v_readlane_b32 s8, v13, 15 |
| ; W64-O0-NEXT: v_readlane_b32 s9, v13, 16 |
| ; W64-O0-NEXT: v_readlane_b32 s10, v13, 17 |
| ; W64-O0-NEXT: v_readlane_b32 s11, v13, 18 |
| ; W64-O0-NEXT: v_readlane_b32 s6, v13, 12 |
| ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: s_nop 2 |
| ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] |
| ; W64-O0-NEXT: s_cbranch_execnz .LBB2_5 |
| ; W64-O0-NEXT: ; %bb.7: |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: v_readlane_b32 s4, v13, 13 |
| ; W64-O0-NEXT: v_readlane_b32 s5, v13, 14 |
| ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] |
| ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill |
| ; W64-O0-NEXT: .LBB2_8: ; %bb2 |
| ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 |
| ; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: v_readlane_b32 s4, v13, 10 |
| ; W64-O0-NEXT: v_readlane_b32 s5, v13, 11 |
| ; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5] |
| ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: global_store_dword v[0:1], v2, off |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload |
| ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] |
| ; W64-O0-NEXT: s_waitcnt vmcnt(0) |
| ; W64-O0-NEXT: s_setpc_b64 s[30:31] |
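| ; IR for the checks above: both rsrc values (%i for the entry-block load, %j |
| ; for the load in %bb1) are VGPR operands, so each buffer load is legalized |
| ; with its own readfirstlane waterfall loop. The inline asm pins |
| ; %live.out.reg to s4, and it is used as the idxen index of both loads. |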
| entry: |
| %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" () |
| %val0 = call float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %i, i32 %live.out.reg, i32 0, i32 0, i32 0) #1 |
| %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %cmp = icmp eq i32 %idx, 0 |
| br i1 %cmp, label %bb1, label %bb2 |
| |
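| ; %bb1 executes only for lanes where the workitem id is 0 and loads through |
| ; the second descriptor %j, so a second waterfall loop is emitted for it. |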
| bb1: |
| %val1 = call float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %j, i32 %live.out.reg, i32 0, i32 0, i32 0) #1 |
| br label %bb2 |
| |
| bb2: |
| %val = phi float [ %val0, %entry ], [ %val1, %bb1 ] |
| store volatile float %val, ptr addrspace(1) %out |
| ret void |
| } |
| |
| declare i32 @llvm.amdgcn.workitem.id.x() #1 |
| declare float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8), i32, i32, i32, i32 immarg) #1 |
| |
| attributes #0 = { nounwind } |
| attributes #1 = { nounwind readonly } |