[AMDGPU][NFC] Split and auto-generate ds.gws.barrier test checks (#190680)

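Move the MIR checks (the -stop-after=postrapseudos RUN lines) out of
llvm.amdgcn.ds.gws.barrier.ll into a new test file,
llvm.amdgcn.ds.gws.barrier-bundle.ll, and generate them with
update_mir_test_checks.py.

Regenerate the assembly checks in llvm.amdgcn.ds.gws.barrier.ll with
update_llc_test_checks.py, filtering out lines that match "load" or
"store". The assembly RUN lines now use the amdgcn-amd-amdhsa triple
instead of amdgcn-mesa-mesa3d, the hawaii and fiji RUN lines are
dropped, and -asm-verbose=0 is no longer passed for gfx1010/gfx1100.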
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-bundle.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-bundle.ll
new file mode 100644
index 0000000..35a13a0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-bundle.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos < %s | FileCheck -check-prefix=GFX6-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos < %s | FileCheck -check-prefix=GFX6-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=postrapseudos < %s | FileCheck -check-prefix=GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=postrapseudos < %s | FileCheck -check-prefix=GFX9-GISEL %s
+
+; Make sure DS_GWS_BARRIER is emitted bundled with an S_WAITCNT, both with the retry loop (tahiti) and without it (gfx900), and that the bundle is not removed by ExpandPostRAPseudos.
+
+define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
+  ; GFX6-SDAG-LABEL: name: gws_barrier_offset0
+  ; GFX6-SDAG: bb.0 (%ir-block.0):
+  ; GFX6-SDAG-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX6-SDAG-NEXT:   liveins: $sgpr8_sgpr9
+  ; GFX6-SDAG-NEXT: {{  $}}
+  ; GFX6-SDAG-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset1, align 16, addrspace 4)
+  ; GFX6-SDAG-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX6-SDAG-NEXT:   $m0 = S_MOV_B32 0
+  ; GFX6-SDAG-NEXT: {{  $}}
+  ; GFX6-SDAG-NEXT: bb.1:
+  ; GFX6-SDAG-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX6-SDAG-NEXT:   liveins: $vgpr0
+  ; GFX6-SDAG-NEXT: {{  $}}
+  ; GFX6-SDAG-NEXT:   S_SETREG_IMM32_B32 0, 515, implicit-def $mode, implicit $mode
+  ; GFX6-SDAG-NEXT:   BUNDLE implicit renamable $vgpr0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") {
+  ; GFX6-SDAG-NEXT:     DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
+  ; GFX6-SDAG-NEXT:     S_WAITCNT 0
+  ; GFX6-SDAG-NEXT:   }
+  ; GFX6-SDAG-NEXT:   renamable $sgpr0 = S_GETREG_B32 515, implicit $mode
+  ; GFX6-SDAG-NEXT:   S_CMP_LG_U32 killed renamable $sgpr0, 0, implicit-def $scc
+  ; GFX6-SDAG-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+  ; GFX6-SDAG-NEXT: {{  $}}
+  ; GFX6-SDAG-NEXT: bb.2:
+  ; GFX6-SDAG-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX6-GISEL-LABEL: name: gws_barrier_offset0
+  ; GFX6-GISEL: bb.0 (%ir-block.0):
+  ; GFX6-GISEL-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX6-GISEL-NEXT:   liveins: $sgpr8_sgpr9
+  ; GFX6-GISEL-NEXT: {{  $}}
+  ; GFX6-GISEL-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset1, align 16, addrspace 4)
+  ; GFX6-GISEL-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX6-GISEL-NEXT:   $m0 = S_MOV_B32 0
+  ; GFX6-GISEL-NEXT: {{  $}}
+  ; GFX6-GISEL-NEXT: bb.1:
+  ; GFX6-GISEL-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX6-GISEL-NEXT:   liveins: $vgpr0
+  ; GFX6-GISEL-NEXT: {{  $}}
+  ; GFX6-GISEL-NEXT:   S_SETREG_IMM32_B32 0, 515, implicit-def $mode, implicit $mode
+  ; GFX6-GISEL-NEXT:   BUNDLE implicit renamable $vgpr0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") {
+  ; GFX6-GISEL-NEXT:     DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
+  ; GFX6-GISEL-NEXT:     S_WAITCNT 0
+  ; GFX6-GISEL-NEXT:   }
+  ; GFX6-GISEL-NEXT:   renamable $sgpr0 = S_GETREG_B32 515, implicit $mode
+  ; GFX6-GISEL-NEXT:   S_CMP_LG_U32 killed renamable $sgpr0, 0, implicit-def $scc
+  ; GFX6-GISEL-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+  ; GFX6-GISEL-NEXT: {{  $}}
+  ; GFX6-GISEL-NEXT: bb.2:
+  ; GFX6-GISEL-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX9-SDAG-LABEL: name: gws_barrier_offset0
+  ; GFX9-SDAG: bb.0 (%ir-block.0):
+  ; GFX9-SDAG-NEXT:   liveins: $sgpr8_sgpr9
+  ; GFX9-SDAG-NEXT: {{  $}}
+  ; GFX9-SDAG-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset1, align 16, addrspace 4)
+  ; GFX9-SDAG-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-SDAG-NEXT:   $m0 = S_MOV_B32 0
+  ; GFX9-SDAG-NEXT:   BUNDLE implicit killed renamable $vgpr0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") {
+  ; GFX9-SDAG-NEXT:     DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
+  ; GFX9-SDAG-NEXT:     S_WAITCNT 0
+  ; GFX9-SDAG-NEXT:   }
+  ; GFX9-SDAG-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX9-GISEL-LABEL: name: gws_barrier_offset0
+  ; GFX9-GISEL: bb.0 (%ir-block.0):
+  ; GFX9-GISEL-NEXT:   liveins: $sgpr8_sgpr9
+  ; GFX9-GISEL-NEXT: {{  $}}
+  ; GFX9-GISEL-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset1, align 16, addrspace 4)
+  ; GFX9-GISEL-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-GISEL-NEXT:   $m0 = S_MOV_B32 0
+  ; GFX9-GISEL-NEXT:   BUNDLE implicit killed renamable $vgpr0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") {
+  ; GFX9-GISEL-NEXT:     DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
+  ; GFX9-GISEL-NEXT:     S_WAITCNT 0
+  ; GFX9-GISEL-NEXT:   }
+  ; GFX9-GISEL-NEXT:   S_ENDPGM 0
+  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
+  ret void
+}
+
+define amdgpu_kernel void @gws_barrier_offset63(i32 %val) #0 {
+  ; GFX6-SDAG-LABEL: name: gws_barrier_offset63
+  ; GFX6-SDAG: bb.0 (%ir-block.0):
+  ; GFX6-SDAG-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX6-SDAG-NEXT:   liveins: $sgpr8_sgpr9
+  ; GFX6-SDAG-NEXT: {{  $}}
+  ; GFX6-SDAG-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset1, align 16, addrspace 4)
+  ; GFX6-SDAG-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX6-SDAG-NEXT:   $m0 = S_MOV_B32 0
+  ; GFX6-SDAG-NEXT: {{  $}}
+  ; GFX6-SDAG-NEXT: bb.1:
+  ; GFX6-SDAG-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX6-SDAG-NEXT:   liveins: $vgpr0
+  ; GFX6-SDAG-NEXT: {{  $}}
+  ; GFX6-SDAG-NEXT:   S_SETREG_IMM32_B32 0, 515, implicit-def $mode, implicit $mode
+  ; GFX6-SDAG-NEXT:   BUNDLE implicit renamable $vgpr0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") {
+  ; GFX6-SDAG-NEXT:     DS_GWS_BARRIER renamable $vgpr0, 63, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
+  ; GFX6-SDAG-NEXT:     S_WAITCNT 0
+  ; GFX6-SDAG-NEXT:   }
+  ; GFX6-SDAG-NEXT:   renamable $sgpr0 = S_GETREG_B32 515, implicit $mode
+  ; GFX6-SDAG-NEXT:   S_CMP_LG_U32 killed renamable $sgpr0, 0, implicit-def $scc
+  ; GFX6-SDAG-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+  ; GFX6-SDAG-NEXT: {{  $}}
+  ; GFX6-SDAG-NEXT: bb.2:
+  ; GFX6-SDAG-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX6-GISEL-LABEL: name: gws_barrier_offset63
+  ; GFX6-GISEL: bb.0 (%ir-block.0):
+  ; GFX6-GISEL-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX6-GISEL-NEXT:   liveins: $sgpr8_sgpr9
+  ; GFX6-GISEL-NEXT: {{  $}}
+  ; GFX6-GISEL-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset1, align 16, addrspace 4)
+  ; GFX6-GISEL-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX6-GISEL-NEXT:   $m0 = S_MOV_B32 0
+  ; GFX6-GISEL-NEXT: {{  $}}
+  ; GFX6-GISEL-NEXT: bb.1:
+  ; GFX6-GISEL-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX6-GISEL-NEXT:   liveins: $vgpr0
+  ; GFX6-GISEL-NEXT: {{  $}}
+  ; GFX6-GISEL-NEXT:   S_SETREG_IMM32_B32 0, 515, implicit-def $mode, implicit $mode
+  ; GFX6-GISEL-NEXT:   BUNDLE implicit renamable $vgpr0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") {
+  ; GFX6-GISEL-NEXT:     DS_GWS_BARRIER renamable $vgpr0, 63, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
+  ; GFX6-GISEL-NEXT:     S_WAITCNT 0
+  ; GFX6-GISEL-NEXT:   }
+  ; GFX6-GISEL-NEXT:   renamable $sgpr0 = S_GETREG_B32 515, implicit $mode
+  ; GFX6-GISEL-NEXT:   S_CMP_LG_U32 killed renamable $sgpr0, 0, implicit-def $scc
+  ; GFX6-GISEL-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+  ; GFX6-GISEL-NEXT: {{  $}}
+  ; GFX6-GISEL-NEXT: bb.2:
+  ; GFX6-GISEL-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX9-SDAG-LABEL: name: gws_barrier_offset63
+  ; GFX9-SDAG: bb.0 (%ir-block.0):
+  ; GFX9-SDAG-NEXT:   liveins: $sgpr8_sgpr9
+  ; GFX9-SDAG-NEXT: {{  $}}
+  ; GFX9-SDAG-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset1, align 16, addrspace 4)
+  ; GFX9-SDAG-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-SDAG-NEXT:   $m0 = S_MOV_B32 0
+  ; GFX9-SDAG-NEXT:   BUNDLE implicit killed renamable $vgpr0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") {
+  ; GFX9-SDAG-NEXT:     DS_GWS_BARRIER renamable $vgpr0, 63, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
+  ; GFX9-SDAG-NEXT:     S_WAITCNT 0
+  ; GFX9-SDAG-NEXT:   }
+  ; GFX9-SDAG-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX9-GISEL-LABEL: name: gws_barrier_offset63
+  ; GFX9-GISEL: bb.0 (%ir-block.0):
+  ; GFX9-GISEL-NEXT:   liveins: $sgpr8_sgpr9
+  ; GFX9-GISEL-NEXT: {{  $}}
+  ; GFX9-GISEL-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset1, align 16, addrspace 4)
+  ; GFX9-GISEL-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-GISEL-NEXT:   $m0 = S_MOV_B32 0
+  ; GFX9-GISEL-NEXT:   BUNDLE implicit killed renamable $vgpr0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") {
+  ; GFX9-GISEL-NEXT:     DS_GWS_BARRIER renamable $vgpr0, 63, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
+  ; GFX9-GISEL-NEXT:     S_WAITCNT 0
+  ; GFX9-GISEL-NEXT:   }
+  ; GFX9-GISEL-NEXT:   S_ENDPGM 0
+  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 63)
+  ret void
+}
+
+declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent inaccessiblememonly nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
index 417b8e0..6218ba4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
@@ -1,113 +1,270 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,LOOP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX9 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX9 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s
-
-; Make sure the op is emitted bundled with a waitcnt with and without the retry loop, and the bundle is not removed by ExpandPostRAPseudos.
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos < %s | FileCheck -check-prefix=MIR %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos < %s | FileCheck -check-prefix=MIR %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=postrapseudos < %s | FileCheck -check-prefix=MIR %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=postrapseudos < %s | FileCheck -check-prefix=MIR %s
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "load" --filter-out "store" --version 6
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=GFX6,GFX6-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=GFX6,GFX6-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s
 
 ; Minimum offset
-; GCN-LABEL: {{^}}gws_barrier_offset0:
-; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]]
-; NOLOOP-DAG: s_mov_b32 m0, 0{{$}}
-; NOLOOP: v_mov_b32_e32 v0, [[BAR_NUM]]
-; NOLOOP: ds_gws_barrier v0 gds{{$}}
-
-; LOOP: s_mov_b32 m0, 0{{$}}
-; LOOP: [[LOOP:.LBB[0-9]+_[0-9]+]]:
-; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
-; LOOP-NEXT: ds_gws_barrier v0 gds
-; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1)
-; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0
-; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]
-
-; MIR-LABEL: name: gws_barrier_offset0{{$}}
-; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec
-; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
-; MIR-NEXT: S_WAITCNT 0
-; MIR-NEXT: }
 define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
+; GFX6-LABEL: gws_barrier_offset0:
+; GFX6:  ; %bb.0:
+; GFX6:    s_mov_b32 m0, 0
+; GFX6:    s_waitcnt lgkmcnt(0)
+; GFX6:    v_mov_b32_e32 v0, s0
+; GFX6:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX6:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6:    ds_gws_barrier v0 gds
+; GFX6:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6:    s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6:    s_cmp_lg_u32 s0, 0
+; GFX6:    s_cbranch_scc1 .LBB0_1
+; GFX6:  ; %bb.2:
+; GFX6:    s_endpgm
+;
+; GCN-LABEL: gws_barrier_offset0:
+; GCN:  ; %bb.0:
+; GCN:    s_mov_b32 m0, 0
+; GCN:    s_waitcnt lgkmcnt(0)
+; GCN:    v_mov_b32_e32 v0, s0
+; GCN:    ds_gws_barrier v0 gds
+; GCN:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN:    s_endpgm
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
   ret void
 }
 
-; MIR-LABEL: name: gws_barrier_offset63{{$}}
-
 ; Maximum offset
-; GCN-LABEL: {{^}}gws_barrier_offset63:
-; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]]
-; NOLOOP-DAG: s_mov_b32 m0, 0{{$}}
-; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
-; NOLOOP: ds_gws_barrier v0 offset:63 gds{{$}}
 define amdgpu_kernel void @gws_barrier_offset63(i32 %val) #0 {
+; GFX6-LABEL: gws_barrier_offset63:
+; GFX6:  ; %bb.0:
+; GFX6:    s_mov_b32 m0, 0
+; GFX6:    s_waitcnt lgkmcnt(0)
+; GFX6:    v_mov_b32_e32 v0, s0
+; GFX6:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX6:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6:    ds_gws_barrier v0 offset:63 gds
+; GFX6:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6:    s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6:    s_cmp_lg_u32 s0, 0
+; GFX6:    s_cbranch_scc1 .LBB1_1
+; GFX6:  ; %bb.2:
+; GFX6:    s_endpgm
+;
+; GCN-LABEL: gws_barrier_offset63:
+; GCN:  ; %bb.0:
+; GCN:    s_mov_b32 m0, 0
+; GCN:    s_waitcnt lgkmcnt(0)
+; GCN:    v_mov_b32_e32 v0, s0
+; GCN:    ds_gws_barrier v0 offset:63 gds
+; GCN:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN:    s_endpgm
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 63)
   ret void
 }
 
 ; FIXME: Should be able to shift directly into m0
-; GCN-LABEL: {{^}}gws_barrier_sgpr_offset:
-; NOLOOP-DAG: s_load_{{dwordx2|b64}} s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]]
-
-; NOLOOP-DAG: s_lshl_b32 m0, s[[OFFSET]], 16
-
-; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]]
-; NOLOOP: ds_gws_barrier [[GWS_VAL]] gds{{$}}
 define amdgpu_kernel void @gws_barrier_sgpr_offset(i32 %val, i32 %offset) #0 {
+; GFX6-LABEL: gws_barrier_sgpr_offset:
+; GFX6:  ; %bb.0:
+; GFX6:    s_waitcnt lgkmcnt(0)
+; GFX6:    v_mov_b32_e32 v0, s0
+; GFX6:    s_lshl_b32 m0, s1, 16
+; GFX6:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX6:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6:    ds_gws_barrier v0 gds
+; GFX6:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6:    s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6:    s_cmp_lg_u32 s0, 0
+; GFX6:    s_cbranch_scc1 .LBB2_1
+; GFX6:  ; %bb.2:
+; GFX6:    s_endpgm
+;
+; GFX9-SDAG-LABEL: gws_barrier_sgpr_offset:
+; GFX9-SDAG:  ; %bb.0:
+; GFX9-SDAG:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG:    s_lshl_b32 m0, s1, 16
+; GFX9-SDAG:    v_mov_b32_e32 v0, s0
+; GFX9-SDAG:    ds_gws_barrier v0 gds
+; GFX9-SDAG:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG:    s_endpgm
+;
+; GFX9-GISEL-LABEL: gws_barrier_sgpr_offset:
+; GFX9-GISEL:  ; %bb.0:
+; GFX9-GISEL:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL:    v_mov_b32_e32 v0, s0
+; GFX9-GISEL:    s_lshl_b32 m0, s1, 16
+; GFX9-GISEL:    s_nop 0
+; GFX9-GISEL:    ds_gws_barrier v0 gds
+; GFX9-GISEL:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL:    s_endpgm
+;
+; GFX1011-LABEL: gws_barrier_sgpr_offset:
+; GFX1011:  ; %bb.0:
+; GFX1011:    s_waitcnt lgkmcnt(0)
+; GFX1011:    v_mov_b32_e32 v0, s0
+; GFX1011:    s_lshl_b32 m0, s1, 16
+; GFX1011:    ds_gws_barrier v0 gds
+; GFX1011:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011:    s_endpgm
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
   ret void
 }
 
 ; Variable offset in SGPR with constant add
-; GCN-LABEL: {{^}}gws_barrier_sgpr_offset_add1:
-; NOLOOP-DAG: s_load_{{dwordx2|b64}} s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]]
-
-; NOLOOP-DAG: s_lshl_b32 m0, s[[OFFSET]], 16
-
-; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]]
-; NOLOOP: ds_gws_barrier [[GWS_VAL]] offset:1 gds{{$}}
 define amdgpu_kernel void @gws_barrier_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
+; GFX6-LABEL: gws_barrier_sgpr_offset_add1:
+; GFX6:  ; %bb.0:
+; GFX6:    s_waitcnt lgkmcnt(0)
+; GFX6:    v_mov_b32_e32 v0, s0
+; GFX6:    s_lshl_b32 m0, s1, 16
+; GFX6:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX6:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6:    ds_gws_barrier v0 offset:1 gds
+; GFX6:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6:    s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6:    s_cmp_lg_u32 s0, 0
+; GFX6:    s_cbranch_scc1 .LBB3_1
+; GFX6:  ; %bb.2:
+; GFX6:    s_endpgm
+;
+; GFX9-SDAG-LABEL: gws_barrier_sgpr_offset_add1:
+; GFX9-SDAG:  ; %bb.0:
+; GFX9-SDAG:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG:    s_lshl_b32 m0, s1, 16
+; GFX9-SDAG:    v_mov_b32_e32 v0, s0
+; GFX9-SDAG:    ds_gws_barrier v0 offset:1 gds
+; GFX9-SDAG:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG:    s_endpgm
+;
+; GFX9-GISEL-LABEL: gws_barrier_sgpr_offset_add1:
+; GFX9-GISEL:  ; %bb.0:
+; GFX9-GISEL:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL:    v_mov_b32_e32 v0, s0
+; GFX9-GISEL:    s_lshl_b32 m0, s1, 16
+; GFX9-GISEL:    s_nop 0
+; GFX9-GISEL:    ds_gws_barrier v0 offset:1 gds
+; GFX9-GISEL:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL:    s_endpgm
+;
+; GFX1011-LABEL: gws_barrier_sgpr_offset_add1:
+; GFX1011:  ; %bb.0:
+; GFX1011:    s_waitcnt lgkmcnt(0)
+; GFX1011:    v_mov_b32_e32 v0, s0
+; GFX1011:    s_lshl_b32 m0, s1, 16
+; GFX1011:    ds_gws_barrier v0 offset:1 gds
+; GFX1011:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011:    s_endpgm
   %offset = add i32 %offset.base, 1
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
   ret void
 }
 
-; GCN-LABEL: {{^}}gws_barrier_vgpr_offset:
-; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]]
-; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
-
-; NOLOOP-DAG: s_lshl_b32 m0, [[READLANE]], 16
-
-; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], [[BAR_NUM]]
-; NOLOOP: ds_gws_barrier [[GWS_VAL]] gds{{$}}
 define amdgpu_kernel void @gws_barrier_vgpr_offset(i32 %val) #0 {
+; GFX6-LABEL: gws_barrier_vgpr_offset:
+; GFX6:  ; %bb.0:
+; GFX6:    v_readfirstlane_b32 s1, v0
+; GFX6:    s_lshl_b32 m0, s1, 16
+; GFX6:    s_waitcnt lgkmcnt(0)
+; GFX6:    v_mov_b32_e32 v0, s0
+; GFX6:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX6:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6:    ds_gws_barrier v0 gds
+; GFX6:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6:    s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6:    s_cmp_lg_u32 s0, 0
+; GFX6:    s_cbranch_scc1 .LBB4_1
+; GFX6:  ; %bb.2:
+; GFX6:    s_endpgm
+;
+; GFX9-LABEL: gws_barrier_vgpr_offset:
+; GFX9:  ; %bb.0:
+; GFX9:    v_readfirstlane_b32 s1, v0
+; GFX9:    s_lshl_b32 m0, s1, 16
+; GFX9:    s_waitcnt lgkmcnt(0)
+; GFX9:    v_mov_b32_e32 v0, s0
+; GFX9:    ds_gws_barrier v0 gds
+; GFX9:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9:    s_endpgm
+;
+; GFX10-LABEL: gws_barrier_vgpr_offset:
+; GFX10:  ; %bb.0:
+; GFX10:    v_readfirstlane_b32 s1, v0
+; GFX10:    s_lshl_b32 m0, s1, 16
+; GFX10:    s_waitcnt lgkmcnt(0)
+; GFX10:    v_mov_b32_e32 v0, s0
+; GFX10:    ds_gws_barrier v0 gds
+; GFX10:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10:    s_endpgm
+;
+; GFX11-LABEL: gws_barrier_vgpr_offset:
+; GFX11:  ; %bb.0:
+; GFX11:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11:    v_readfirstlane_b32 s1, v0
+; GFX11:    s_lshl_b32 m0, s1, 16
+; GFX11:    s_waitcnt lgkmcnt(0)
+; GFX11:    v_mov_b32_e32 v0, s0
+; GFX11:    ds_gws_barrier v0 gds
+; GFX11:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11:    s_endpgm
   %vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
   ret void
 }
 
 ; Variable offset in VGPR with constant add
-; GCN-LABEL: {{^}}gws_barrier_vgpr_offset_add:
-; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]]
-; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
-
-; NOLOOP-DAG: s_lshl_b32 m0, [[READLANE]], 16
-
-; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], [[BAR_NUM]]
-; NOLOOP: ds_gws_barrier [[GWS_VAL]] offset:3 gds{{$}}
 define amdgpu_kernel void @gws_barrier_vgpr_offset_add(i32 %val) #0 {
+; GFX6-LABEL: gws_barrier_vgpr_offset_add:
+; GFX6:  ; %bb.0:
+; GFX6:    v_readfirstlane_b32 s1, v0
+; GFX6:    s_lshl_b32 m0, s1, 16
+; GFX6:    s_waitcnt lgkmcnt(0)
+; GFX6:    v_mov_b32_e32 v0, s0
+; GFX6:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX6:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6:    ds_gws_barrier v0 offset:3 gds
+; GFX6:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6:    s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6:    s_cmp_lg_u32 s0, 0
+; GFX6:    s_cbranch_scc1 .LBB5_1
+; GFX6:  ; %bb.2:
+; GFX6:    s_endpgm
+;
+; GFX9-LABEL: gws_barrier_vgpr_offset_add:
+; GFX9:  ; %bb.0:
+; GFX9:    v_readfirstlane_b32 s1, v0
+; GFX9:    s_lshl_b32 m0, s1, 16
+; GFX9:    s_waitcnt lgkmcnt(0)
+; GFX9:    v_mov_b32_e32 v0, s0
+; GFX9:    ds_gws_barrier v0 offset:3 gds
+; GFX9:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9:    s_endpgm
+;
+; GFX10-LABEL: gws_barrier_vgpr_offset_add:
+; GFX10:  ; %bb.0:
+; GFX10:    v_readfirstlane_b32 s1, v0
+; GFX10:    s_lshl_b32 m0, s1, 16
+; GFX10:    s_waitcnt lgkmcnt(0)
+; GFX10:    v_mov_b32_e32 v0, s0
+; GFX10:    ds_gws_barrier v0 offset:3 gds
+; GFX10:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10:    s_endpgm
+;
+; GFX11-LABEL: gws_barrier_vgpr_offset_add:
+; GFX11:  ; %bb.0:
+; GFX11:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11:    v_readfirstlane_b32 s1, v0
+; GFX11:    s_lshl_b32 m0, s1, 16
+; GFX11:    s_waitcnt lgkmcnt(0)
+; GFX11:    v_mov_b32_e32 v0, s0
+; GFX11:    ds_gws_barrier v0 offset:3 gds
+; GFX11:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11:    s_endpgm
   %vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
   %vgpr.offset = add i32 %vgpr.offset.base, 3
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
@@ -117,20 +274,90 @@
 @lds = internal unnamed_addr addrspace(3) global i32 poison
 
 ; Check if m0 initialization is shared
-; GCN-LABEL: {{^}}gws_barrier_save_m0_barrier_constant_offset:
-; NOLOOP: s_mov_b32 m0, 0
-; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:10 gds
-
-; LOOP: s_mov_b32 m0, -1
-; LOOP: ds_write_b32
-; LOOP: s_mov_b32 m0, 0
-; LOOP: s_setreg_imm32_b32
-; LOOP: ds_gws_barrier v{{[0-9]+}} offset:10 gds
-; LOOP: s_cbranch_scc1
-
-; LOOP: s_mov_b32 m0, -1
-; LOOP: ds_write_b32
 define amdgpu_kernel void @gws_barrier_save_m0_barrier_constant_offset(i32 %val) #0 {
+; GFX6-SDAG-LABEL: gws_barrier_save_m0_barrier_constant_offset:
+; GFX6-SDAG:  ; %bb.0:
+; GFX6-SDAG:    v_mov_b32_e32 v1, 1
+; GFX6-SDAG:    v_mov_b32_e32 v0, 0
+; GFX6-SDAG:    s_mov_b32 m0, -1
+; GFX6-SDAG:    ds_write_b32 v0, v1
+; GFX6-SDAG:    s_waitcnt lgkmcnt(0)
+; GFX6-SDAG:    v_mov_b32_e32 v1, s0
+; GFX6-SDAG:    s_mov_b32 m0, 0
+; GFX6-SDAG:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-SDAG:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6-SDAG:    ds_gws_barrier v1 offset:10 gds
+; GFX6-SDAG:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG:    s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6-SDAG:    s_cmp_lg_u32 s0, 0
+; GFX6-SDAG:    s_cbranch_scc1 .LBB6_1
+; GFX6-SDAG:  ; %bb.2:
+; GFX6-SDAG:    v_mov_b32_e32 v1, 2
+; GFX6-SDAG:    s_mov_b32 m0, -1
+; GFX6-SDAG:    ds_write_b32 v0, v1
+; GFX6-SDAG:    s_endpgm
+;
+; GFX6-GISEL-LABEL: gws_barrier_save_m0_barrier_constant_offset:
+; GFX6-GISEL:  ; %bb.0:
+; GFX6-GISEL:    v_mov_b32_e32 v0, 1
+; GFX6-GISEL:    v_mov_b32_e32 v1, 0
+; GFX6-GISEL:    s_mov_b32 m0, -1
+; GFX6-GISEL:    ds_write_b32 v1, v0
+; GFX6-GISEL:    s_waitcnt lgkmcnt(0)
+; GFX6-GISEL:    v_mov_b32_e32 v0, s0
+; GFX6-GISEL:    s_mov_b32 m0, 0
+; GFX6-GISEL:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-GISEL:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6-GISEL:    ds_gws_barrier v0 offset:10 gds
+; GFX6-GISEL:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL:    s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6-GISEL:    s_cmp_lg_u32 s0, 0
+; GFX6-GISEL:    s_cbranch_scc1 .LBB6_1
+; GFX6-GISEL:  ; %bb.2:
+; GFX6-GISEL:    v_mov_b32_e32 v0, 2
+; GFX6-GISEL:    v_mov_b32_e32 v1, 0
+; GFX6-GISEL:    s_mov_b32 m0, -1
+; GFX6-GISEL:    ds_write_b32 v1, v0
+; GFX6-GISEL:    s_endpgm
+;
+; GFX9-LABEL: gws_barrier_save_m0_barrier_constant_offset:
+; GFX9:  ; %bb.0:
+; GFX9:    v_mov_b32_e32 v0, 1
+; GFX9:    v_mov_b32_e32 v1, 0
+; GFX9:    ds_write_b32 v1, v0
+; GFX9:    s_mov_b32 m0, 0
+; GFX9:    s_waitcnt lgkmcnt(0)
+; GFX9:    v_mov_b32_e32 v0, s0
+; GFX9:    ds_gws_barrier v0 offset:10 gds
+; GFX9:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9:    v_mov_b32_e32 v0, 2
+; GFX9:    ds_write_b32 v1, v0
+; GFX9:    s_endpgm
+;
+; GFX10-LABEL: gws_barrier_save_m0_barrier_constant_offset:
+; GFX10:  ; %bb.0:
+; GFX10:    v_mov_b32_e32 v0, 1
+; GFX10:    v_mov_b32_e32 v1, 0
+; GFX10:    s_mov_b32 m0, 0
+; GFX10:    ds_write_b32 v1, v0
+; GFX10:    v_mov_b32_e32 v0, 2
+; GFX10:    s_waitcnt lgkmcnt(0)
+; GFX10:    v_mov_b32_e32 v2, s0
+; GFX10:    ds_gws_barrier v2 offset:10 gds
+; GFX10:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10:    ds_write_b32 v1, v0
+; GFX10:    s_endpgm
+;
+; GFX11-LABEL: gws_barrier_save_m0_barrier_constant_offset:
+; GFX11:  ; %bb.0:
+; GFX11:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 0
+; GFX11:    s_mov_b32 m0, 0
+; GFX11:    v_mov_b32_e32 v0, 2
+; GFX11:    s_waitcnt lgkmcnt(0)
+; GFX11:    v_mov_b32_e32 v2, s0
+; GFX11:    ds_gws_barrier v2 offset:10 gds
+; GFX11:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11:    s_endpgm
   store i32 1, ptr addrspace(3) @lds
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 10)
   store i32 2, ptr addrspace(3) @lds
@@ -138,46 +365,256 @@
 }
 
 ; Make sure this increments lgkmcnt
-; GCN-LABEL: {{^}}gws_barrier_lgkmcnt:
-; NOLOOP: s_mov_b32 m0, 0{{$}}
-; NOLOOP: ds_gws_barrier v0 gds{{$}}
-; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; NOLOOP-NEXT: s_setpc_b64
 define void @gws_barrier_lgkmcnt(i32 %val) {
+; GFX6-LABEL: gws_barrier_lgkmcnt:
+; GFX6:  ; %bb.0:
+; GFX6:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6:    s_mov_b32 m0, 0
+; GFX6:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX6:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6:    ds_gws_barrier v0 gds
+; GFX6:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6:    s_getreg_b32 s4, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6:    s_cmp_lg_u32 s4, 0
+; GFX6:    s_cbranch_scc1 .LBB7_1
+; GFX6:  ; %bb.2:
+; GFX6:    s_setpc_b64 s[30:31]
+; The gfx6 checks above loop back to the barrier until the hwreg(HW_REG_TRAPSTS, 8, 1) field reads 0; gfx9 below issues it once, with an s_nop 0 after the m0 write.
+; GFX9-LABEL: gws_barrier_lgkmcnt:
+; GFX9:  ; %bb.0:
+; GFX9:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9:    s_mov_b32 m0, 0
+; GFX9:    s_nop 0
+; GFX9:    ds_gws_barrier v0 gds
+; GFX9:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9:    s_setpc_b64 s[30:31]
+; The shared gfx10/gfx11 sequence below is the gfx9 one without the s_nop 0.
+; GFX1011-LABEL: gws_barrier_lgkmcnt:
+; GFX1011:  ; %bb.0:
+; GFX1011:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011:    s_mov_b32 m0, 0
+; GFX1011:    ds_gws_barrier v0 gds
+; GFX1011:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011:    s_setpc_b64 s[30:31]
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
   ret void
 }
 
 ; Does not imply memory fence on its own
-; GCN-LABEL: {{^}}gws_barrier_wait_before:
-; NOLOOP: s_waitcnt
-; NOLOOP-NOT: s_waitcnt{{$}}
 define amdgpu_kernel void @gws_barrier_wait_before(i32 %val, ptr addrspace(1) %ptr) #0 {
+; GFX6-SDAG-LABEL: gws_barrier_wait_before:
+; GFX6-SDAG:  ; %bb.0:
+; GFX6-SDAG:    s_mov_b32 s3, 0x100f000
+; GFX6-SDAG:    s_mov_b32 s2, -1
+; GFX6-SDAG:    v_mov_b32_e32 v0, 0
+; GFX6-SDAG:    s_waitcnt lgkmcnt(0)
+; GFX6-SDAG:    s_waitcnt expcnt(0)
+; GFX6-SDAG:    v_mov_b32_e32 v0, s4
+; GFX6-SDAG:    s_mov_b32 m0, 0
+; GFX6-SDAG:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-SDAG:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6-SDAG:    ds_gws_barrier v0 offset:7 gds
+; GFX6-SDAG:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG:    s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6-SDAG:    s_cmp_lg_u32 s0, 0
+; GFX6-SDAG:    s_cbranch_scc1 .LBB8_1
+; GFX6-SDAG:  ; %bb.2:
+; GFX6-SDAG:    s_endpgm
+; The gfx6 GlobalISel checks below keep the same retry loop as the SelectionDAG ones above; only the order of the v0/s2/s3 setup moves differs.
+; GFX6-GISEL-LABEL: gws_barrier_wait_before:
+; GFX6-GISEL:  ; %bb.0:
+; GFX6-GISEL:    v_mov_b32_e32 v0, 0
+; GFX6-GISEL:    s_mov_b32 s2, -1
+; GFX6-GISEL:    s_mov_b32 s3, 0x100f000
+; GFX6-GISEL:    s_waitcnt lgkmcnt(0)
+; GFX6-GISEL:    s_waitcnt expcnt(0)
+; GFX6-GISEL:    v_mov_b32_e32 v0, s4
+; GFX6-GISEL:    s_mov_b32 m0, 0
+; GFX6-GISEL:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-GISEL:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6-GISEL:    ds_gws_barrier v0 offset:7 gds
+; GFX6-GISEL:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL:    s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6-GISEL:    s_cmp_lg_u32 s0, 0
+; GFX6-GISEL:    s_cbranch_scc1 .LBB8_1
+; GFX6-GISEL:  ; %bb.2:
+; GFX6-GISEL:    s_endpgm
+; The gfx9 checks below have no retry loop; the only wait listed ahead of the barrier is s_waitcnt lgkmcnt(0).
+; GFX9-LABEL: gws_barrier_wait_before:
+; GFX9:  ; %bb.0:
+; GFX9:    v_mov_b32_e32 v0, 0
+; GFX9:    s_mov_b32 m0, 0
+; GFX9:    s_waitcnt lgkmcnt(0)
+; GFX9:    v_mov_b32_e32 v0, s2
+; GFX9:    ds_gws_barrier v0 offset:7 gds
+; GFX9:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9:    s_endpgm
+; The shared gfx10/gfx11 checks below open with s_clause 0x1 and pass the barrier operand in v1 instead of reusing v0.
+; GFX1011-LABEL: gws_barrier_wait_before:
+; GFX1011:  ; %bb.0:
+; GFX1011:    s_clause 0x1
+; GFX1011:    v_mov_b32_e32 v0, 0
+; GFX1011:    s_mov_b32 m0, 0
+; GFX1011:    s_waitcnt lgkmcnt(0)
+; GFX1011:    v_mov_b32_e32 v1, s2
+; GFX1011:    ds_gws_barrier v1 offset:7 gds
+; GFX1011:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011:    s_endpgm
   store i32 0, ptr addrspace(1) %ptr
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
   ret void
 }
 
-; GCN-LABEL: {{^}}gws_barrier_wait_after:
-; NOLOOP: s_mov_b32 m0, 0{{$}}
-; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:7 gds
-; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; NOLOOP: load_{{dword|b32}}
 define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, ptr addrspace(1) %ptr) #0 {
+; GFX6-SDAG-LABEL: gws_barrier_wait_after:
+; GFX6-SDAG:  ; %bb.0:
+; GFX6-SDAG:    s_mov_b32 s3, 0x100f000
+; GFX6-SDAG:    s_mov_b32 s2, -1
+; GFX6-SDAG:    s_mov_b32 m0, 0
+; GFX6-SDAG:    s_waitcnt lgkmcnt(0)
+; GFX6-SDAG:    v_mov_b32_e32 v0, s4
+; GFX6-SDAG:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-SDAG:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6-SDAG:    ds_gws_barrier v0 offset:7 gds
+; GFX6-SDAG:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG:    s_getreg_b32 s4, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6-SDAG:    s_cmp_lg_u32 s4, 0
+; GFX6-SDAG:    s_cbranch_scc1 .LBB9_1
+; GFX6-SDAG:  ; %bb.2:
+; GFX6-SDAG:    s_waitcnt vmcnt(0)
+; GFX6-SDAG:    s_endpgm
+; The gfx6 GlobalISel checks below keep the same retry loop but set s2/s3 after the loop exits rather than before it.
+; GFX6-GISEL-LABEL: gws_barrier_wait_after:
+; GFX6-GISEL:  ; %bb.0:
+; GFX6-GISEL:    s_mov_b32 m0, 0
+; GFX6-GISEL:    s_waitcnt lgkmcnt(0)
+; GFX6-GISEL:    v_mov_b32_e32 v0, s2
+; GFX6-GISEL:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-GISEL:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6-GISEL:    ds_gws_barrier v0 offset:7 gds
+; GFX6-GISEL:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL:    s_getreg_b32 s2, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6-GISEL:    s_cmp_lg_u32 s2, 0
+; GFX6-GISEL:    s_cbranch_scc1 .LBB9_1
+; GFX6-GISEL:  ; %bb.2:
+; GFX6-GISEL:    s_mov_b32 s2, -1
+; GFX6-GISEL:    s_mov_b32 s3, 0x100f000
+; GFX6-GISEL:    s_waitcnt vmcnt(0)
+; GFX6-GISEL:    s_endpgm
+; The gfx9 SelectionDAG checks below have no retry loop; a separate s_waitcnt vmcnt(0) follows the full wait after the barrier.
+; GFX9-SDAG-LABEL: gws_barrier_wait_after:
+; GFX9-SDAG:  ; %bb.0:
+; GFX9-SDAG:    s_mov_b32 m0, 0
+; GFX9-SDAG:    v_mov_b32_e32 v0, 0
+; GFX9-SDAG:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG:    v_mov_b32_e32 v1, s2
+; GFX9-SDAG:    ds_gws_barrier v1 offset:7 gds
+; GFX9-SDAG:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG:    s_waitcnt vmcnt(0)
+; GFX9-SDAG:    s_endpgm
+; The gfx9 GlobalISel checks below materialize the zero after the barrier, reusing v0, instead of before it.
+; GFX9-GISEL-LABEL: gws_barrier_wait_after:
+; GFX9-GISEL:  ; %bb.0:
+; GFX9-GISEL:    s_mov_b32 m0, 0
+; GFX9-GISEL:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL:    v_mov_b32_e32 v0, s2
+; GFX9-GISEL:    ds_gws_barrier v0 offset:7 gds
+; GFX9-GISEL:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL:    v_mov_b32_e32 v0, 0
+; GFX9-GISEL:    s_waitcnt vmcnt(0)
+; GFX9-GISEL:    s_endpgm
+; The gfx10 checks below are also loop-free and open with s_clause 0x1.
+; GFX10-LABEL: gws_barrier_wait_after:
+; GFX10:  ; %bb.0:
+; GFX10:    s_clause 0x1
+; GFX10:    v_mov_b32_e32 v1, 0
+; GFX10:    s_mov_b32 m0, 0
+; GFX10:    s_waitcnt lgkmcnt(0)
+; GFX10:    v_mov_b32_e32 v0, s2
+; GFX10:    ds_gws_barrier v0 offset:7 gds
+; GFX10:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10:    s_waitcnt vmcnt(0)
+; GFX10:    s_endpgm
+; The gfx11 checks below replace the two v_mov_b32 of gfx10 with a single v_dual_mov_b32.
+; GFX11-LABEL: gws_barrier_wait_after:
+; GFX11:  ; %bb.0:
+; GFX11:    s_clause 0x1
+; GFX11:    s_mov_b32 m0, 0
+; GFX11:    s_waitcnt lgkmcnt(0)
+; GFX11:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11:    ds_gws_barrier v0 offset:7 gds
+; GFX11:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11:    s_waitcnt vmcnt(0)
+; GFX11:    s_endpgm
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
   %load = load volatile i32, ptr addrspace(1) %ptr
   ret void
 }
 
 ; Does not imply memory fence on its own
-; GCN-LABEL: {{^}}gws_barrier_fence_before:
-; NOLOOP: s_mov_b32 m0, 0{{$}}
-; NOLOOP: store_{{dword|b32}}
-; GFX9: s_waitcnt vmcnt(0)
-; GFX10: s_waitcnt_vscnt null, 0x0
-; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:7 gds
-; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, ptr addrspace(1) %ptr) #0 {
+; GFX6-SDAG-LABEL: gws_barrier_fence_before:
+; GFX6-SDAG:  ; %bb.0:
+; GFX6-SDAG:    s_mov_b32 s3, 0x100f000
+; GFX6-SDAG:    s_mov_b32 s2, -1
+; GFX6-SDAG:    v_mov_b32_e32 v0, 0
+; GFX6-SDAG:    s_waitcnt lgkmcnt(0)
+; GFX6-SDAG:    s_waitcnt expcnt(0)
+; GFX6-SDAG:    v_mov_b32_e32 v0, s4
+; GFX6-SDAG:    s_waitcnt vmcnt(0)
+; GFX6-SDAG:    s_mov_b32 m0, 0
+; GFX6-SDAG:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-SDAG:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6-SDAG:    ds_gws_barrier v0 offset:7 gds
+; GFX6-SDAG:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG:    s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6-SDAG:    s_cmp_lg_u32 s0, 0
+; GFX6-SDAG:    s_cbranch_scc1 .LBB10_1
+; GFX6-SDAG:  ; %bb.2:
+; GFX6-SDAG:    s_endpgm
+;
+; GFX6-GISEL-LABEL: gws_barrier_fence_before:
+; GFX6-GISEL:  ; %bb.0:
+; GFX6-GISEL:    v_mov_b32_e32 v0, 0
+; GFX6-GISEL:    s_mov_b32 s2, -1
+; GFX6-GISEL:    s_mov_b32 s3, 0x100f000
+; GFX6-GISEL:    s_waitcnt lgkmcnt(0)
+; GFX6-GISEL:    s_waitcnt expcnt(0)
+; GFX6-GISEL:    v_mov_b32_e32 v0, s4
+; GFX6-GISEL:    s_waitcnt vmcnt(0)
+; GFX6-GISEL:    s_mov_b32 m0, 0
+; GFX6-GISEL:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-GISEL:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6-GISEL:    ds_gws_barrier v0 offset:7 gds
+; GFX6-GISEL:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL:    s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6-GISEL:    s_cmp_lg_u32 s0, 0
+; GFX6-GISEL:    s_cbranch_scc1 .LBB10_1
+; GFX6-GISEL:  ; %bb.2:
+; GFX6-GISEL:    s_endpgm
+;
+; GFX9-LABEL: gws_barrier_fence_before:
+; GFX9:  ; %bb.0:
+; GFX9:    v_mov_b32_e32 v0, 0
+; GFX9:    s_mov_b32 m0, 0
+; GFX9:    s_waitcnt lgkmcnt(0)
+; GFX9:    v_mov_b32_e32 v0, s2
+; GFX9:    s_waitcnt vmcnt(0)
+; GFX9:    ds_gws_barrier v0 offset:7 gds
+; GFX9:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9:    s_endpgm
+;
+; GFX1011-LABEL: gws_barrier_fence_before:
+; GFX1011:  ; %bb.0:
+; GFX1011:    s_clause 0x1
+; GFX1011:    v_mov_b32_e32 v0, 0
+; GFX1011:    s_mov_b32 m0, 0
+; GFX1011:    s_waitcnt lgkmcnt(0)
+; GFX1011:    v_mov_b32_e32 v1, s2
+; GFX1011:    s_waitcnt_vscnt null, 0x0
+; GFX1011:    ds_gws_barrier v1 offset:7 gds
+; GFX1011:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011:    s_endpgm
   store i32 0, ptr addrspace(1) %ptr
   fence release
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
@@ -185,13 +622,76 @@
 }
 
 ; FIXME: Extra waitcnt
-; GCN-LABEL: {{^}}gws_barrier_fence_after:
-; NOLOOP: s_mov_b32 m0, 0{{$}}
-; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:7 gds
-; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; NOLOOP-NEXT: load_{{dword|b32}}
-
 define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, ptr addrspace(1) %ptr) #0 {
+; GFX6-SDAG-LABEL: gws_barrier_fence_after:
+; GFX6-SDAG:  ; %bb.0:
+; GFX6-SDAG:    s_mov_b32 s3, 0x100f000
+; GFX6-SDAG:    s_mov_b32 s2, -1
+; GFX6-SDAG:    s_mov_b32 m0, 0
+; GFX6-SDAG:    s_waitcnt lgkmcnt(0)
+; GFX6-SDAG:    v_mov_b32_e32 v0, s4
+; GFX6-SDAG:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-SDAG:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6-SDAG:    ds_gws_barrier v0 offset:7 gds
+; GFX6-SDAG:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG:    s_getreg_b32 s4, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6-SDAG:    s_cmp_lg_u32 s4, 0
+; GFX6-SDAG:    s_cbranch_scc1 .LBB11_1
+; GFX6-SDAG:  ; %bb.2:
+; GFX6-SDAG:    s_waitcnt vmcnt(0)
+; GFX6-SDAG:    s_endpgm
+;
+; GFX6-GISEL-LABEL: gws_barrier_fence_after:
+; GFX6-GISEL:  ; %bb.0:
+; GFX6-GISEL:    s_mov_b32 m0, 0
+; GFX6-GISEL:    s_waitcnt lgkmcnt(0)
+; GFX6-GISEL:    v_mov_b32_e32 v0, s2
+; GFX6-GISEL:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-GISEL:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6-GISEL:    ds_gws_barrier v0 offset:7 gds
+; GFX6-GISEL:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL:    s_getreg_b32 s2, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6-GISEL:    s_cmp_lg_u32 s2, 0
+; GFX6-GISEL:    s_cbranch_scc1 .LBB11_1
+; GFX6-GISEL:  ; %bb.2:
+; GFX6-GISEL:    s_mov_b32 s2, -1
+; GFX6-GISEL:    s_mov_b32 s3, 0x100f000
+; GFX6-GISEL:    s_waitcnt vmcnt(0)
+; GFX6-GISEL:    s_endpgm
+;
+; GFX9-LABEL: gws_barrier_fence_after:
+; GFX9:  ; %bb.0:
+; GFX9:    v_mov_b32_e32 v0, 0
+; GFX9:    s_mov_b32 m0, 0
+; GFX9:    s_waitcnt lgkmcnt(0)
+; GFX9:    v_mov_b32_e32 v1, s2
+; GFX9:    ds_gws_barrier v1 offset:7 gds
+; GFX9:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9:    s_waitcnt vmcnt(0)
+; GFX9:    s_endpgm
+;
+; GFX10-LABEL: gws_barrier_fence_after:
+; GFX10:  ; %bb.0:
+; GFX10:    s_clause 0x1
+; GFX10:    v_mov_b32_e32 v1, 0
+; GFX10:    s_mov_b32 m0, 0
+; GFX10:    s_waitcnt lgkmcnt(0)
+; GFX10:    v_mov_b32_e32 v0, s2
+; GFX10:    ds_gws_barrier v0 offset:7 gds
+; GFX10:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10:    s_waitcnt vmcnt(0)
+; GFX10:    s_endpgm
+;
+; GFX11-LABEL: gws_barrier_fence_after:
+; GFX11:  ; %bb.0:
+; GFX11:    s_clause 0x1
+; GFX11:    s_mov_b32 m0, 0
+; GFX11:    s_waitcnt lgkmcnt(0)
+; GFX11:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11:    ds_gws_barrier v0 offset:7 gds
+; GFX11:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11:    s_waitcnt vmcnt(0)
+; GFX11:    s_endpgm
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
   fence release
   %load = load volatile i32, ptr addrspace(1) %ptr
@@ -199,26 +699,132 @@
 }
 
 ; FIXME: Should a wait be inserted here, or is an explicit fence needed?
-; GCN-LABEL: {{^}}gws_init_barrier:
-; NOLOOP: s_mov_b32 m0, 0
-; NOLOOP: ds_gws_init v0 offset:7 gds
-; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; NOLOOP-NEXT: ds_gws_barrier v0 offset:7 gds
-; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 {
+; GFX6-SDAG-LABEL: gws_init_barrier:
+; GFX6-SDAG:  ; %bb.0:
+; GFX6-SDAG:    s_mov_b32 m0, 0
+; GFX6-SDAG:    s_waitcnt lgkmcnt(0)
+; GFX6-SDAG:    v_mov_b32_e32 v0, s0
+; GFX6-SDAG:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-SDAG:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6-SDAG:    ds_gws_init v0 offset:7 gds
+; GFX6-SDAG:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG:    s_getreg_b32 s1, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6-SDAG:    s_cmp_lg_u32 s1, 0
+; GFX6-SDAG:    s_cbranch_scc1 .LBB12_1
+; GFX6-SDAG:  ; %bb.2:
+; GFX6-SDAG:    v_mov_b32_e32 v0, s0
+; GFX6-SDAG:  .LBB12_3: ; =>This Inner Loop Header: Depth=1
+; GFX6-SDAG:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6-SDAG:    ds_gws_barrier v0 offset:7 gds
+; GFX6-SDAG:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG:    s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6-SDAG:    s_cmp_lg_u32 s0, 0
+; GFX6-SDAG:    s_cbranch_scc1 .LBB12_3
+; GFX6-SDAG:  ; %bb.4:
+; GFX6-SDAG:    s_endpgm
+; The gfx6 GlobalISel checks below have the same two retry loops (init, then barrier) plus an extra s_mov_b32 m0, 0 ahead of the second loop.
+; GFX6-GISEL-LABEL: gws_init_barrier:
+; GFX6-GISEL:  ; %bb.0:
+; GFX6-GISEL:    s_mov_b32 m0, 0
+; GFX6-GISEL:    s_waitcnt lgkmcnt(0)
+; GFX6-GISEL:    v_mov_b32_e32 v0, s0
+; GFX6-GISEL:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-GISEL:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6-GISEL:    ds_gws_init v0 offset:7 gds
+; GFX6-GISEL:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL:    s_getreg_b32 s1, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6-GISEL:    s_cmp_lg_u32 s1, 0
+; GFX6-GISEL:    s_cbranch_scc1 .LBB12_1
+; GFX6-GISEL:  ; %bb.2:
+; GFX6-GISEL:    v_mov_b32_e32 v0, s0
+; GFX6-GISEL:    s_mov_b32 m0, 0
+; GFX6-GISEL:  .LBB12_3: ; =>This Inner Loop Header: Depth=1
+; GFX6-GISEL:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6-GISEL:    ds_gws_barrier v0 offset:7 gds
+; GFX6-GISEL:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL:    s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6-GISEL:    s_cmp_lg_u32 s0, 0
+; GFX6-GISEL:    s_cbranch_scc1 .LBB12_3
+; GFX6-GISEL:  ; %bb.4:
+; GFX6-GISEL:    s_endpgm
+; The shared-prefix checks below have no loops; ds_gws_init and ds_gws_barrier appear back to back, each followed by a full s_waitcnt.
+; GCN-LABEL: gws_init_barrier:
+; GCN:  ; %bb.0:
+; GCN:    s_mov_b32 m0, 0
+; GCN:    s_waitcnt lgkmcnt(0)
+; GCN:    v_mov_b32_e32 v0, s0
+; GCN:    ds_gws_init v0 offset:7 gds
+; GCN:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN:    ds_gws_barrier v0 offset:7 gds
+; GCN:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN:    s_endpgm
   call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
   ret void
 }
 
 ; FIXME: Why vmcnt, not expcnt?
-; GCN-LABEL: {{^}}gws_init_fence_barrier:
-; NOLOOP: s_mov_b32 m0, 0
-; NOLOOP: ds_gws_init v0 offset:7 gds
-; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; NOLOOP-NEXT: ds_gws_barrier v0 offset:7 gds
-; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 define amdgpu_kernel void @gws_init_fence_barrier(i32 %val) #0 {
+; GFX6-SDAG-LABEL: gws_init_fence_barrier:
+; GFX6-SDAG:  ; %bb.0:
+; GFX6-SDAG:    s_mov_b32 m0, 0
+; GFX6-SDAG:    s_waitcnt lgkmcnt(0)
+; GFX6-SDAG:    v_mov_b32_e32 v0, s0
+; GFX6-SDAG:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-SDAG:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6-SDAG:    ds_gws_init v0 offset:7 gds
+; GFX6-SDAG:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG:    s_getreg_b32 s1, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6-SDAG:    s_cmp_lg_u32 s1, 0
+; GFX6-SDAG:    s_cbranch_scc1 .LBB13_1
+; GFX6-SDAG:  ; %bb.2:
+; GFX6-SDAG:    v_mov_b32_e32 v0, s0
+; GFX6-SDAG:  .LBB13_3: ; =>This Inner Loop Header: Depth=1
+; GFX6-SDAG:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6-SDAG:    ds_gws_barrier v0 offset:7 gds
+; GFX6-SDAG:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG:    s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6-SDAG:    s_cmp_lg_u32 s0, 0
+; GFX6-SDAG:    s_cbranch_scc1 .LBB13_3
+; GFX6-SDAG:  ; %bb.4:
+; GFX6-SDAG:    s_endpgm
+;
+; GFX6-GISEL-LABEL: gws_init_fence_barrier:
+; GFX6-GISEL:  ; %bb.0:
+; GFX6-GISEL:    s_mov_b32 m0, 0
+; GFX6-GISEL:    s_waitcnt lgkmcnt(0)
+; GFX6-GISEL:    v_mov_b32_e32 v0, s0
+; GFX6-GISEL:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-GISEL:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6-GISEL:    ds_gws_init v0 offset:7 gds
+; GFX6-GISEL:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL:    s_getreg_b32 s1, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6-GISEL:    s_cmp_lg_u32 s1, 0
+; GFX6-GISEL:    s_cbranch_scc1 .LBB13_1
+; GFX6-GISEL:  ; %bb.2:
+; GFX6-GISEL:    v_mov_b32_e32 v0, s0
+; GFX6-GISEL:    s_mov_b32 m0, 0
+; GFX6-GISEL:  .LBB13_3: ; =>This Inner Loop Header: Depth=1
+; GFX6-GISEL:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; GFX6-GISEL:    ds_gws_barrier v0 offset:7 gds
+; GFX6-GISEL:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL:    s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
+; GFX6-GISEL:    s_cmp_lg_u32 s0, 0
+; GFX6-GISEL:    s_cbranch_scc1 .LBB13_3
+; GFX6-GISEL:  ; %bb.4:
+; GFX6-GISEL:    s_endpgm
+;
+; GCN-LABEL: gws_init_fence_barrier:
+; GCN:  ; %bb.0:
+; GCN:    s_mov_b32 m0, 0
+; GCN:    s_waitcnt lgkmcnt(0)
+; GCN:    v_mov_b32_e32 v0, s0
+; GCN:    ds_gws_init v0 offset:7 gds
+; GCN:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN:    ds_gws_barrier v0 offset:7 gds
+; GCN:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN:    s_endpgm
   call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
   fence release
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)