| # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s | 
 | # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s | 
 | # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX12 %s | 
 |  | 
 | --- | 
 |  | 
 | # The loop contains a store and a use of a value loaded outside of the loop. | 
 | # We expect the waitcnt for that use to be hoisted into the preheader (bb.0) on GFX9, but to stay in the | 
 | # loop (bb.1) on GFX10 and GFX12, because those targets have the vscnt counter. | 
 |  | 
 | # GFX9-LABEL: waitcnt_vm_loop | 
 | # GFX9-LABEL: bb.0: | 
 | # GFX9: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.1: | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.2: | 
 |  | 
 | # GFX10-LABEL: waitcnt_vm_loop | 
 | # GFX10-LABEL: bb.0: | 
 | # GFX10-NOT: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.1: | 
 | # GFX10: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.2: | 
 |  | 
 | # GFX12-LABEL: waitcnt_vm_loop | 
 | # GFX12-LABEL: bb.0: | 
 | # GFX12-NOT: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.1: | 
 | # GFX12: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.2: | 
 | name:            waitcnt_vm_loop | 
 | body:             | | 
 |   bb.0: | 
 |     successors: %bb.1 | 
 |  | 
 |     $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     S_BRANCH %bb.1 | 
 |  | 
 |   bb.1: | 
 |     successors: %bb.1, %bb.2 | 
 |  | 
 |     BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec | 
 |     S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc | 
 |     S_CBRANCH_SCC1 %bb.1, implicit killed $scc | 
 |     S_BRANCH %bb.2 | 
 |  | 
 |   bb.2: | 
 |     S_ENDPGM 0 | 
 |  | 
 | ... | 
 | --- | 
 |  | 
 | # Same as waitcnt_vm_loop, but the loop preheader (bb.0) has no terminator; the expected waits are unchanged. | 
 |  | 
 | # GFX9-LABEL: waitcnt_vm_loop_noterm | 
 | # GFX9-LABEL: bb.0: | 
 | # GFX9: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.1: | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.2: | 
 |  | 
 | # GFX10-LABEL: waitcnt_vm_loop_noterm | 
 | # GFX10-LABEL: bb.0: | 
 | # GFX10-NOT: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.1: | 
 | # GFX10: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.2: | 
 |  | 
 | # GFX12-LABEL: waitcnt_vm_loop_noterm | 
 | # GFX12-LABEL: bb.0: | 
 | # GFX12-NOT: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.1: | 
 | # GFX12: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.2: | 
 | name:            waitcnt_vm_loop_noterm | 
 | body:             | | 
 |   bb.0: | 
 |     successors: %bb.1 | 
 |  | 
 |     $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |  | 
 |   bb.1: | 
 |     successors: %bb.1, %bb.2 | 
 |  | 
 |     BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec | 
 |     S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc | 
 |     S_CBRANCH_SCC1 %bb.1, implicit killed $scc | 
 |     S_BRANCH %bb.2 | 
 |  | 
 |   bb.2: | 
 |     S_ENDPGM 0 | 
 |  | 
 | ... | 
 | --- | 
 |  | 
 | # Same as waitcnt_vm_loop_noterm, but there is a preexisting S_WAITCNT in the preheader; no second wait may follow it in bb.0 and none may appear in the loop. Only GFX9 is checked. | 
 |  | 
 | # GFX9-LABEL: waitcnt_vm_loop_noterm_wait | 
 | # GFX9-LABEL: bb.0: | 
 | # GFX9: S_WAITCNT 39 | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.1: | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.2: | 
 | name:            waitcnt_vm_loop_noterm_wait | 
 | body:             | | 
 |   bb.0: | 
 |     successors: %bb.1 | 
 |  | 
 |     $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     S_WAITCNT 3952 | 
 |  | 
 |   bb.1: | 
 |     successors: %bb.1, %bb.2 | 
 |  | 
 |     BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec | 
 |     S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc | 
 |     S_CBRANCH_SCC1 %bb.1, implicit killed $scc | 
 |     S_BRANCH %bb.2 | 
 |  | 
 |   bb.2: | 
 |     S_ENDPGM 0 | 
 |  | 
 | ... | 
 | --- | 
 |  | 
 | # The loop contains a store, a load, and a V_ADD that uses values loaded both inside ($vgpr7) and | 
 | # outside ($vgpr0) the loop. | 
 | # We do not expect the waitcnt to be hoisted out of the loop: every target waits in bb.1 and not in bb.0. | 
 |  | 
 | # GFX9-LABEL: waitcnt_vm_loop_load | 
 | # GFX9-LABEL: bb.0: | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.1: | 
 | # GFX9: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.2: | 
 |  | 
 | # GFX10-LABEL: waitcnt_vm_loop_load | 
 | # GFX10-LABEL: bb.0: | 
 | # GFX10-NOT: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.1: | 
 | # GFX10: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.2: | 
 |  | 
 | # GFX12-LABEL: waitcnt_vm_loop_load | 
 | # GFX12-LABEL: bb.0: | 
 | # GFX12-NOT: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.1: | 
 | # GFX12: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.2: | 
 | name:            waitcnt_vm_loop_load | 
 | body:             | | 
 |   bb.0: | 
 |     successors: %bb.1 | 
 |  | 
 |     $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     S_BRANCH %bb.1 | 
 |  | 
 |   bb.1: | 
 |     successors: %bb.1, %bb.2 | 
 |  | 
 |     BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     $vgpr7 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr7, implicit $exec | 
 |     S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc | 
 |     S_CBRANCH_SCC1 %bb.1, implicit killed $scc | 
 |     S_BRANCH %bb.2 | 
 |  | 
 |   bb.2: | 
 |     S_ENDPGM 0 | 
 |  | 
 | ... | 
 | --- | 
 |  | 
 | # The loop contains a use of a value loaded outside of the loop ($vgpr0), and neither a store | 
 | # nor a load. | 
 | # We do not expect the waitcnt to be hoisted out of the loop: all three targets wait in bb.1, not in bb.0. | 
 |  | 
 | # GFX9-LABEL: waitcnt_vm_loop_no_store | 
 | # GFX9-LABEL: bb.0: | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.1: | 
 | # GFX9: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.2: | 
 |  | 
 | # GFX10-LABEL: waitcnt_vm_loop_no_store | 
 | # GFX10-LABEL: bb.0: | 
 | # GFX10-NOT: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.1: | 
 | # GFX10: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.2: | 
 |  | 
 | # GFX12-LABEL: waitcnt_vm_loop_no_store | 
 | # GFX12-LABEL: bb.0: | 
 | # GFX12-NOT: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.1: | 
 | # GFX12: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.2: | 
 | name:            waitcnt_vm_loop_no_store | 
 | body:             | | 
 |   bb.0: | 
 |     successors: %bb.1 | 
 |  | 
 |     $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     S_BRANCH %bb.1 | 
 |  | 
 |   bb.1: | 
 |     successors: %bb.1, %bb.2 | 
 |  | 
 |     $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec | 
 |     S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc | 
 |     S_CBRANCH_SCC1 %bb.1, implicit killed $scc | 
 |     S_BRANCH %bb.2 | 
 |  | 
 |   bb.2: | 
 |     S_ENDPGM 0 | 
 |  | 
 | ... | 
 | --- | 
 |  | 
 | # The loop contains a store, no load, and doesn't use any value loaded inside | 
 | # or outside of the loop. The only use of the loaded value ($vgpr0) is in the | 
 | # exit block (bb.2). | 
 | # We don't expect any s_waitcnt vmcnt in the loop body or preheader, but expect | 
 | # one in the exit block. NOTE(review): the checks only cover bb.0 and bb.1; nothing after the bb.2 label is checked. | 
 |  | 
 |  | 
 | # GFX9-LABEL: waitcnt_vm_loop_no_use | 
 | # GFX9-LABEL: bb.0: | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.1: | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.2: | 
 |  | 
 | # GFX10-LABEL: waitcnt_vm_loop_no_use | 
 | # GFX10-LABEL: bb.0: | 
 | # GFX10-NOT: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.1: | 
 | # GFX10-NOT: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.2: | 
 |  | 
 | # GFX12-LABEL: waitcnt_vm_loop_no_use | 
 | # GFX12-LABEL: bb.0: | 
 | # GFX12-NOT: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.1: | 
 | # GFX12-NOT: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.2: | 
 | name:            waitcnt_vm_loop_no_use | 
 | body:             | | 
 |   bb.0: | 
 |     successors: %bb.1 | 
 |  | 
 |     $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     S_BRANCH %bb.1 | 
 |  | 
 |   bb.1: | 
 |     successors: %bb.1, %bb.2 | 
 |  | 
 |     BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     $vgpr1 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec | 
 |     S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc | 
 |     S_CBRANCH_SCC1 %bb.1, implicit killed $scc | 
 |     S_BRANCH %bb.2 | 
 |  | 
 |   bb.2: | 
 |     $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec | 
 |     S_ENDPGM 0 | 
 |  | 
 | ... | 
 | --- | 
 |  | 
 | # The loop loads a value that is not used in the loop, and uses a value loaded | 
 | # outside of the loop. | 
 | # On GFX9 and GFX10 we expect the waitcnt to be hoisted out of the loop to wait a single time before | 
 | # the loop is executed and avoid waiting for the load to complete on each | 
 | # iteration. The GFX12 checks instead expect the wait to stay in the loop (bb.1), with none in bb.0. | 
 |  | 
 | # GFX9-LABEL: waitcnt_vm_loop2 | 
 | # GFX9-LABEL: bb.0: | 
 | # GFX9: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.1: | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.2: | 
 |  | 
 | # GFX10-LABEL: waitcnt_vm_loop2 | 
 | # GFX10-LABEL: bb.0: | 
 | # GFX10: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.1: | 
 | # GFX10-NOT: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.2: | 
 |  | 
 | # GFX12-LABEL: waitcnt_vm_loop2 | 
 | # GFX12-LABEL: bb.0: | 
 | # GFX12: BUFFER_LOAD_FORMAT_X_IDXEN | 
 | # GFX12-NOT: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.1: | 
 | # GFX12: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.2: | 
 | name:            waitcnt_vm_loop2 | 
 | body:             | | 
 |   bb.0: | 
 |     successors: %bb.1 | 
 |  | 
 |     $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     S_BRANCH %bb.1 | 
 |  | 
 |   bb.1: | 
 |     successors: %bb.1, %bb.2 | 
 |  | 
 |     $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec | 
 |     $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc | 
 |     S_CBRANCH_SCC1 %bb.1, implicit killed $scc | 
 |     S_BRANCH %bb.2 | 
 |  | 
 |   bb.2: | 
 |     S_ENDPGM 0 | 
 |  | 
 | ... | 
 | --- | 
 |  | 
 | # Same as waitcnt_vm_loop2 with an additional store in the loop. We still expect the | 
 | # waitcnt instructions to be hoisted on GFX9 and GFX10; the GFX12 checks still expect the wait inside the loop. | 
 |  | 
 | # GFX9-LABEL: waitcnt_vm_loop2_store | 
 | # GFX9-LABEL: bb.0: | 
 | # GFX9: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.1: | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.2: | 
 |  | 
 | # GFX10-LABEL: waitcnt_vm_loop2_store | 
 | # GFX10-LABEL: bb.0: | 
 | # GFX10: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.1: | 
 | # GFX10-NOT: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.2: | 
 |  | 
 | # GFX12-LABEL: waitcnt_vm_loop2_store | 
 | # GFX12-LABEL: bb.0: | 
 | # GFX12: BUFFER_LOAD_FORMAT_X_IDXEN | 
 | # GFX12-NOT: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.1: | 
 | # GFX12: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.2: | 
 | name:            waitcnt_vm_loop2_store | 
 | body:             | | 
 |   bb.0: | 
 |     successors: %bb.1 | 
 |  | 
 |     $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     S_BRANCH %bb.1 | 
 |  | 
 |   bb.1: | 
 |     successors: %bb.1, %bb.2 | 
 |  | 
 |     $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec | 
 |     $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc | 
 |     S_CBRANCH_SCC1 %bb.1, implicit killed $scc | 
 |     S_BRANCH %bb.2 | 
 |  | 
 |   bb.2: | 
 |     S_ENDPGM 0 | 
 |  | 
 | ... | 
 | --- | 
 |  | 
 | # Same as waitcnt_vm_loop2 but the value loaded inside the loop ($vgpr1) is also used in the loop. | 
 | # We do not expect the waitcnt to be hoisted out of the loop on any of the three targets. | 
 |  | 
 | # GFX9-LABEL: waitcnt_vm_loop2_use_in_loop | 
 | # GFX9-LABEL: bb.0: | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.1: | 
 | # GFX9: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.2: | 
 |  | 
 | # GFX10-LABEL: waitcnt_vm_loop2_use_in_loop | 
 | # GFX10-LABEL: bb.0: | 
 | # GFX10-NOT: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.1: | 
 | # GFX10: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.2: | 
 |  | 
 | # GFX12-LABEL: waitcnt_vm_loop2_use_in_loop | 
 | # GFX12-LABEL: bb.0: | 
 | # GFX12-NOT: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.1: | 
 | # GFX12: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.2: | 
 | name:            waitcnt_vm_loop2_use_in_loop | 
 | body:             | | 
 |   bb.0: | 
 |     successors: %bb.1 | 
 |  | 
 |     $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     S_BRANCH %bb.1 | 
 |  | 
 |   bb.1: | 
 |     successors: %bb.1, %bb.2 | 
 |  | 
 |     $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec | 
 |     $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     $vgpr4 = V_ADD_U32_e32 $vgpr5, $vgpr1, implicit $exec | 
 |     S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc | 
 |     S_CBRANCH_SCC1 %bb.1, implicit killed $scc | 
 |     S_BRANCH %bb.2 | 
 |  | 
 |   bb.2: | 
 |     S_ENDPGM 0 | 
 |  | 
 | ... | 
 | --- | 
 |  | 
 | # The loop (bb.2) contains a use of a value loaded outside of the loop, but we already | 
 | # waited for that load to complete in bb.0. The loop also loads a value that is not used | 
 | # in the loop. We do not expect any waitcnt in the loop on GFX9 and GFX10; the GFX12 checks expect S_WAIT_LOADCNT 0 in bb.2. | 
 |  | 
 | # GFX9-LABEL: waitcnt_vm_loop2_nowait | 
 | # GFX9-LABEL: bb.0: | 
 | # GFX9: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.1: | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.2: | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.3: | 
 |  | 
 | # GFX10-LABEL: waitcnt_vm_loop2_nowait | 
 | # GFX10-LABEL: bb.0: | 
 | # GFX10: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.1: | 
 | # GFX10-NOT: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.2: | 
 | # GFX10-NOT: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.3: | 
 |  | 
 | # GFX12-LABEL: waitcnt_vm_loop2_nowait | 
 | # GFX12-LABEL: bb.0: | 
 | # GFX12: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.1: | 
 | # GFX12-NOT: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.2: | 
 | # GFX12: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.3: | 
 | name:            waitcnt_vm_loop2_nowait | 
 | body:             | | 
 |   bb.0: | 
 |     successors: %bb.1 | 
 |  | 
 |     $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec | 
 |     S_BRANCH %bb.1 | 
 |  | 
 |   bb.1: | 
 |     successors: %bb.2 | 
 |  | 
 |     $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec | 
 |     $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec | 
 |     $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec | 
 |  | 
 |     S_BRANCH %bb.2 | 
 |  | 
 |   bb.2: | 
 |     successors: %bb.2, %bb.3 | 
 |  | 
 |     $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec | 
 |     $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc | 
 |     S_CBRANCH_SCC1 %bb.2, implicit killed $scc | 
 |     S_BRANCH %bb.3 | 
 |  | 
 |   bb.3: | 
 |     S_ENDPGM 0 | 
 |  | 
 | ... | 
 | --- | 
 |  | 
 | # Similar to waitcnt_vm_loop2 but for register intervals: $vgpr0-$vgpr3 are loaded outside the loop, and $vgpr4-$vgpr7 are loaded but not used inside it. | 
 |  | 
 | # GFX9-LABEL: waitcnt_vm_loop2_reginterval | 
 | # GFX9-LABEL: bb.0: | 
 | # GFX9: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.1: | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.2: | 
 |  | 
 | # GFX10-LABEL: waitcnt_vm_loop2_reginterval | 
 | # GFX10-LABEL: bb.0: | 
 | # GFX10: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.1: | 
 | # GFX10-NOT: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.2: | 
 |  | 
 | # GFX12-LABEL: waitcnt_vm_loop2_reginterval | 
 | # GFX12-LABEL: bb.0: | 
 | # GFX12: GLOBAL_LOAD_DWORDX4 | 
 | # GFX12-NOT: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.1: | 
 | # GFX12: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.2: | 
 | name:            waitcnt_vm_loop2_reginterval | 
 | body:             | | 
 |   bb.0: | 
 |     successors: %bb.1 | 
 |  | 
 |     $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec | 
 |  | 
 |     S_BRANCH %bb.1 | 
 |  | 
 |   bb.1: | 
 |     successors: %bb.1, %bb.2 | 
 |  | 
 |     $vgpr10 = COPY $vgpr0 | 
 |  | 
 |     $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) | 
 |     S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc | 
 |     S_CBRANCH_SCC1 %bb.1, implicit killed $scc | 
 |     S_BRANCH %bb.2 | 
 |  | 
 |   bb.2: | 
 |     S_ENDPGM 0 | 
 |  | 
 | ... | 
 | --- | 
 |  | 
 | # Same as waitcnt_vm_loop2_reginterval, but $vgpr7, which is loaded inside the loop, is also used in the loop; no target is expected to hoist the wait. | 
 |  | 
 | # GFX9-LABEL: waitcnt_vm_loop2_reginterval2 | 
 | # GFX9-LABEL: bb.0: | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.1: | 
 | # GFX9: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.2: | 
 |  | 
 | # GFX10-LABEL: waitcnt_vm_loop2_reginterval2 | 
 | # GFX10-LABEL: bb.0: | 
 | # GFX10-NOT: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.1: | 
 | # GFX10: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.2: | 
 |  | 
 | # GFX12-LABEL: waitcnt_vm_loop2_reginterval2 | 
 | # GFX12-LABEL: bb.0: | 
 | # GFX12-NOT: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.1: | 
 | # GFX12: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.2: | 
 | name:            waitcnt_vm_loop2_reginterval2 | 
 | body:             | | 
 |   bb.0: | 
 |     successors: %bb.1 | 
 |  | 
 |     $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec | 
 |  | 
 |     S_BRANCH %bb.1 | 
 |  | 
 |   bb.1: | 
 |     successors: %bb.1, %bb.2 | 
 |  | 
 |     $vgpr10 = COPY $vgpr0 | 
 |  | 
 |     $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) | 
 |     $vgpr11 = COPY $vgpr7 | 
 |     S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc | 
 |     S_CBRANCH_SCC1 %bb.1, implicit killed $scc | 
 |     S_BRANCH %bb.2 | 
 |  | 
 |   bb.2: | 
 |     S_ENDPGM 0 | 
 |  | 
 | ... | 
 | --- | 
 |  | 
 | # The loop loads a value that is not used in the loop, but uses a value loaded | 
 | # outside of it. We expect the s_waitcnt instruction to be hoisted on GFX9 and GFX10; the GFX12 checks expect it in bb.1. | 
 | # An s_waitcnt vmcnt(0) is generated to flush in the preheader, but for this | 
 | # specific test case, it would be better to use vmcnt(1) instead. This is | 
 | # currently not implemented. | 
 |  | 
 | # GFX9-LABEL: waitcnt_vm_zero | 
 | # GFX9-LABEL: bb.0: | 
 | # GFX9: S_WAITCNT 3952 | 
 | # GFX9-LABEL: bb.1: | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.2: | 
 |  | 
 | # GFX10-LABEL: waitcnt_vm_zero | 
 | # GFX10-LABEL: bb.0: | 
 | # GFX10: S_WAITCNT 16240 | 
 | # GFX10-LABEL: bb.1: | 
 | # GFX10-NOT: S_WAITCNT 16240 | 
 | # GFX10-LABEL: bb.2: | 
 |  | 
 | # GFX12-LABEL: waitcnt_vm_zero | 
 | # GFX12-LABEL: bb.0: | 
 | # GFX12: BUFFER_LOAD_FORMAT_X_IDXEN | 
 | # GFX12: BUFFER_LOAD_FORMAT_X_IDXEN | 
 | # GFX12-NOT: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.1: | 
 | # GFX12: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.2: | 
 |  | 
 | name:            waitcnt_vm_zero | 
 | body:             | | 
 |   bb.0: | 
 |     successors: %bb.1 | 
 |  | 
 |     $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |     $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec | 
 |     S_BRANCH %bb.1 | 
 |  | 
 |   bb.1: | 
 |     successors: %bb.1, %bb.2 | 
 |  | 
 |     $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr3, implicit $exec | 
 |     $vgpr2 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr3, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec | 
 |     S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc | 
 |     S_CBRANCH_SCC1 %bb.1, implicit killed $scc | 
 |     S_BRANCH %bb.2 | 
 |  | 
 |   bb.2: | 
 |     S_ENDPGM 0 | 
 |  | 
 | ... | 
 | --- | 
 |  | 
 | # This test case checks that we flush the vmcnt counter only if necessary | 
 | # (i.e. if a waitcnt is needed for the vgpr use we find in the loop). NOTE(review): the GFX12 negative checks look for S_WAITCNT although the GFX12 waits checked in this file are S_WAIT_LOADCNT; confirm that is intended. | 
 |  | 
 | # GFX10-LABEL: waitcnt_vm_necessary | 
 | # GFX10-LABEL: bb.0: | 
 | # GFX10: S_WAITCNT 16240 | 
 | # GFX10: $vgpr4 | 
 | # GFX10-NOT: S_WAITCNT | 
 | # GFX10-LABEL: bb.1: | 
 | # GFX10-NOT: S_WAITCNT | 
 |  | 
 | # GFX12-LABEL: waitcnt_vm_necessary | 
 | # GFX12-LABEL: bb.0: | 
 | # GFX12: S_WAIT_LOADCNT 0 | 
 | # GFX12: $vgpr4 | 
 | # GFX12-NOT: S_WAITCNT | 
 | # GFX12-LABEL: bb.1: | 
 | # GFX12-NOT: S_WAITCNT | 
 |  | 
 | # GFX9-LABEL: waitcnt_vm_necessary | 
 | # GFX9-LABEL: bb.0: | 
 | # GFX9: S_WAITCNT 3952 | 
 | # GFX9: $vgpr4 | 
 | # GFX9-NOT: S_WAITCNT | 
 | # GFX9-LABEL: bb.1: | 
 | # GFX9-NOT: S_WAITCNT | 
 |  | 
 | name:            waitcnt_vm_necessary | 
 | body:             | | 
 |   bb.0: | 
 |     successors: %bb.1(0x80000000) | 
 |  | 
 |     $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 killed $vgpr0_vgpr1, 0, 0, implicit $exec | 
 |     $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec | 
 |  | 
 |   bb.1: | 
 |     successors: %bb.1(0x40000000) | 
 |  | 
 |     $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec | 
 |     S_CBRANCH_SCC1 %bb.1, implicit killed $scc | 
 |     S_ENDPGM 0 | 
 |  | 
 | ... | 
 | --- | 
 |  | 
 | # The loop contains a global store and a use of a value that was loaded from global memory outside of the loop. Hoisting is expected on GFX9 only. | 
 |  | 
 | # GFX9-LABEL: waitcnt_vm_loop_global_mem | 
 | # GFX9-LABEL: bb.0: | 
 | # GFX9: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.1: | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.2: | 
 |  | 
 | # GFX10-LABEL: waitcnt_vm_loop_global_mem | 
 | # GFX10-LABEL: bb.0: | 
 | # GFX10-NOT: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.1: | 
 | # GFX10: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.2: | 
 |  | 
 | # GFX12-LABEL: waitcnt_vm_loop_global_mem | 
 | # GFX12-LABEL: bb.0: | 
 | # GFX12-NOT: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.1: | 
 | # GFX12: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.2: | 
 |  | 
 | name:            waitcnt_vm_loop_global_mem | 
 | body:             | | 
 |   bb.0: | 
 |     successors: %bb.1 | 
 |     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec | 
 |     S_BRANCH %bb.1 | 
 |  | 
 |   bb.1: | 
 |     successors: %bb.1, %bb.2 | 
 |  | 
 |     $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec | 
 |     GLOBAL_STORE_DWORD $vgpr4_vgpr5, $vgpr6, 0, 0, implicit $exec | 
 |     S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc | 
 |     S_CBRANCH_SCC1 %bb.1, implicit killed $scc | 
 |  | 
 |   bb.2: | 
 |     successors: %bb.3 | 
 |     S_BRANCH %bb.3 | 
 |  | 
 |   bb.3: | 
 |     S_ENDPGM 0 | 
 |  | 
 | ... | 
 | --- | 
 |  | 
 | # Same as waitcnt_vm_loop_global_mem, but using scratch memory instructions (SCRATCH_LOAD_DWORD / SCRATCH_STORE_DWORD) instead. | 
 |  | 
 | # GFX9-LABEL: waitcnt_vm_loop_scratch_mem | 
 | # GFX9-LABEL: bb.0: | 
 | # GFX9: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.1: | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.2: | 
 |  | 
 | # GFX10-LABEL: waitcnt_vm_loop_scratch_mem | 
 | # GFX10-LABEL: bb.0: | 
 | # GFX10-NOT: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.1: | 
 | # GFX10: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.2: | 
 |  | 
 | # GFX12-LABEL: waitcnt_vm_loop_scratch_mem | 
 | # GFX12-LABEL: bb.0: | 
 | # GFX12-NOT: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.1: | 
 | # GFX12: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.2: | 
 |  | 
 | name:            waitcnt_vm_loop_scratch_mem | 
 | body:             | | 
 |   bb.0: | 
 |     successors: %bb.1 | 
 |     $vgpr0 = SCRATCH_LOAD_DWORD $vgpr1, 0, 0, implicit $exec, implicit $flat_scr | 
 |     S_BRANCH %bb.1 | 
 |  | 
 |   bb.1: | 
 |     successors: %bb.1, %bb.2 | 
 |  | 
 |     $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec | 
 |     SCRATCH_STORE_DWORD $vgpr4, $vgpr6, 0, 0, implicit $exec, implicit $flat_scr | 
 |     S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc | 
 |     S_CBRANCH_SCC1 %bb.1, implicit killed $scc | 
 |  | 
 |   bb.2: | 
 |     successors: %bb.3 | 
 |     S_BRANCH %bb.3 | 
 |  | 
 |   bb.3: | 
 |     S_ENDPGM 0 | 
 |  | 
 | ... | 
 | --- | 
 |  | 
 | # Same as waitcnt_vm_loop_global_mem, but using flat memory instructions instead; note the GFX10 and GFX12 checks look for S_WAITCNT 11 and S_WAIT_LOADCNT_DSCNT 0 here. | 
 |  | 
 | # GFX9-LABEL: waitcnt_vm_loop_flat_mem | 
 | # GFX9-LABEL: bb.0: | 
 | # GFX9: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.1: | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.2: | 
 |  | 
 | # GFX10-LABEL: waitcnt_vm_loop_flat_mem | 
 | # GFX10-LABEL: bb.0: | 
 | # GFX10-NOT: S_WAITCNT 11 | 
 | # GFX10-LABEL: bb.1: | 
 | # GFX10: S_WAITCNT 11 | 
 | # GFX10-LABEL: bb.2: | 
 |  | 
 | # GFX12-LABEL: waitcnt_vm_loop_flat_mem | 
 | # GFX12-LABEL: bb.0: | 
 | # GFX12: FLAT_LOAD_DWORD | 
 | # GFX12-NOT: S_WAIT_LOADCNT_DSCNT 0 | 
 | # GFX12-LABEL: bb.1: | 
 | # GFX12: S_WAIT_LOADCNT_DSCNT 0 | 
 | # GFX12-LABEL: bb.2: | 
 | name:            waitcnt_vm_loop_flat_mem | 
 | body:             | | 
 |   bb.0: | 
 |     successors: %bb.1 | 
 |     $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr | 
 |     S_BRANCH %bb.1 | 
 |  | 
 |   bb.1: | 
 |     successors: %bb.1, %bb.2 | 
 |  | 
 |     $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec | 
 |     FLAT_STORE_DWORD $vgpr4_vgpr5, $vgpr6, 0, 0, implicit $exec, implicit $flat_scr | 
 |     S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc | 
 |     S_CBRANCH_SCC1 %bb.1, implicit killed $scc | 
 |  | 
 |   bb.2: | 
 |     successors: %bb.3 | 
 |     S_BRANCH %bb.3 | 
 |  | 
 |   bb.3: | 
 |     S_ENDPGM 0 | 
 |  | 
 | ... | 
 | --- | 
 |  | 
 | # The loop contains a store, a load, and uses values loaded both inside and | 
 | # outside the loop. NOTE(review): despite the "flat" in the name, the body uses GLOBAL_LOAD_DWORD / GLOBAL_STORE_DWORD. | 
 | # We do not expect the waitcnt to be hoisted out of the loop. | 
 |  | 
 | # GFX9-LABEL: waitcnt_vm_loop_flat_load | 
 | # GFX9-LABEL: bb.0: | 
 | # GFX9-NOT: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.1: | 
 | # GFX9: S_WAITCNT 39 | 
 | # GFX9-LABEL: bb.2: | 
 |  | 
 | # GFX10-LABEL: waitcnt_vm_loop_flat_load | 
 | # GFX10-LABEL: bb.0: | 
 | # GFX10-NOT: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.1: | 
 | # GFX10: S_WAITCNT 16 | 
 | # GFX10-LABEL: bb.2: | 
 |  | 
 | # GFX12-LABEL: waitcnt_vm_loop_flat_load | 
 | # GFX12-LABEL: bb.0: | 
 | # GFX12-NOT: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.1: | 
 | # GFX12: S_WAIT_LOADCNT 0 | 
 | # GFX12-LABEL: bb.2: | 
 | name:            waitcnt_vm_loop_flat_load | 
 | body:             | | 
 |   bb.0: | 
 |     successors: %bb.1 | 
 |  | 
 |     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec | 
 |     S_BRANCH %bb.1 | 
 |  | 
 |   bb.1: | 
 |     successors: %bb.1, %bb.2 | 
 |  | 
 |     GLOBAL_STORE_DWORD $vgpr4_vgpr5, $vgpr6, 0, 0, implicit $exec | 
 |     $vgpr7 = GLOBAL_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec | 
 |     $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr7, implicit $exec | 
 |     S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc | 
 |     S_CBRANCH_SCC1 %bb.1, implicit killed $scc | 
 |     S_BRANCH %bb.2 | 
 |  | 
 |   bb.2: | 
 |     S_ENDPGM 0 | 
 |  | 
 | ... | 