; blob: 85ccf0c6007eb99e36876781411438fbc76ee85b [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX9
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX942
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX10,GFX10-W32
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX10,GFX10-W64
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX12,GFX12-W32
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX12,GFX12-W64
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX1250
; Workgroup-scope acq_rel fence in a kernel that carries attribute group #0.
; The attribute group is defined outside this view; judging by the function
; name it presumably limits the workgroup to a single wave32 - confirm against
; the attribute list at the end of the file.
; Expected MIR after si-memory-legalizer: every run line sees only S_ENDPGM
; (the gfx1250 run additionally has an S_SETREG_IMM32_B32 first), i.e. no wait
; and no cache invalidate is expected for the fence on any target.
define amdgpu_kernel void @wg_fence_acq_rel_single32() #0 {
; GFX9-LABEL: name: wg_fence_acq_rel_single32
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_fence_acq_rel_single32
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-LABEL: name: wg_fence_acq_rel_single32
; GFX10: bb.0 (%ir-block.0):
; GFX10-NEXT: S_ENDPGM 0
;
; GFX12-LABEL: name: wg_fence_acq_rel_single32
; GFX12: bb.0 (%ir-block.0):
; GFX12-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_fence_acq_rel_single32
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: S_ENDPGM 0
fence syncscope("workgroup") acq_rel
ret void
}
; Workgroup-scope acq_rel fence in a kernel that carries attribute group #1.
; The attribute group is defined outside this view; judging by the function
; name it presumably limits the workgroup to a single wave64 - confirm against
; the attribute list at the end of the file.
; Expected MIR after si-memory-legalizer:
;  * gfx900, gfx942 and the +wavefrontsize64 runs of gfx1010/gfx1200 see only
;    S_ENDPGM (no waits, no invalidate).
;  * the gfx1010/gfx1200 runs without +wavefrontsize64 (the W32 prefixes) see
;    the soft waits followed by a cache invalidate (BUFFER_GL0_INV on gfx1010,
;    GLOBAL_INV on gfx1200).
;  * gfx1250 sees soft waits on LOADCNT/STORECNT/DSCNT and no invalidate.
define amdgpu_kernel void @wg_fence_acq_rel_single64() #1 {
; GFX9-LABEL: name: wg_fence_acq_rel_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_fence_acq_rel_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: wg_fence_acq_rel_single64
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-W32-NEXT: S_WAITCNT_lds_direct
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: wg_fence_acq_rel_single64
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: wg_fence_acq_rel_single64
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: wg_fence_acq_rel_single64
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_fence_acq_rel_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: S_ENDPGM 0
fence syncscope("workgroup") acq_rel
ret void
}
; Workgroup-scope acq_rel fence in a kernel that carries attribute group #2.
; The attribute group is defined outside this view; judging by the function
; name it presumably allows a workgroup of several waves - confirm against the
; attribute list at the end of the file.
; Expected MIR after si-memory-legalizer: every target keeps synchronization
; for the fence.
;  * gfx900 and gfx942 see an LGKM soft wait plus S_WAITCNT_lds_direct.
;  * gfx1010 and gfx1200 see the full set of soft waits followed by a cache
;    invalidate; both wavefront sizes share one check prefix, so the expected
;    output does not depend on the wavefront size here.
;  * gfx1250 sees soft waits on LOADCNT/STORECNT/DSCNT and no invalidate.
define amdgpu_kernel void @wg_fence_acq_rel_multi() #2 {
; GFX9-LABEL: name: wg_fence_acq_rel_multi
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX9-NEXT: S_WAITCNT_lds_direct
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_fence_acq_rel_multi
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX942-NEXT: S_WAITCNT_lds_direct
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-LABEL: name: wg_fence_acq_rel_multi
; GFX10: bb.0 (%ir-block.0):
; GFX10-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-NEXT: S_WAITCNT_lds_direct
; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-NEXT: S_ENDPGM 0
;
; GFX12-LABEL: name: wg_fence_acq_rel_multi
; GFX12: bb.0 (%ir-block.0):
; GFX12-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_fence_acq_rel_multi
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: S_ENDPGM 0
fence syncscope("workgroup") acq_rel
ret void
}
; Workgroup-scope acquire fence in a kernel that carries attribute group #1
; (defined outside this view; presumably a single-wave64 workgroup limit -
; confirm against the attribute list at the end of the file).
; Expected MIR after si-memory-legalizer:
;  * gfx900, gfx942 and the +wavefrontsize64 runs of gfx1010/gfx1200 see only
;    S_ENDPGM.
;  * the gfx1010/gfx1200 runs without +wavefrontsize64 see soft waits followed
;    by a cache invalidate. No S_WAITCNT_lds_direct (gfx1010) and no
;    BVHCNT/SAMPLECNT waits (gfx1200) are expected for this ordering.
;  * gfx1250 sees soft waits on LOADCNT/STORECNT/DSCNT and no invalidate.
define amdgpu_kernel void @wg_fence_acquire_single64() #1 {
; GFX9-LABEL: name: wg_fence_acquire_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_fence_acquire_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: wg_fence_acquire_single64
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: wg_fence_acquire_single64
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: wg_fence_acquire_single64
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: wg_fence_acquire_single64
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_fence_acquire_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: S_ENDPGM 0
fence syncscope("workgroup") acquire
ret void
}
; Workgroup-scope release fence in a kernel that carries attribute group #1
; (defined outside this view; presumably a single-wave64 workgroup limit -
; confirm against the attribute list at the end of the file).
; Expected MIR after si-memory-legalizer:
;  * gfx900, gfx942 and the +wavefrontsize64 runs of gfx1010/gfx1200 see only
;    S_ENDPGM.
;  * the gfx1010/gfx1200 runs without +wavefrontsize64 see soft waits only;
;    no cache invalidate is expected for this ordering.
;  * gfx1250 sees soft waits on LOADCNT/STORECNT/DSCNT.
define amdgpu_kernel void @wg_fence_release_single64() #1 {
; GFX9-LABEL: name: wg_fence_release_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_fence_release_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: wg_fence_release_single64
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-W32-NEXT: S_WAITCNT_lds_direct
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: wg_fence_release_single64
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: wg_fence_release_single64
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: wg_fence_release_single64
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_fence_release_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: S_ENDPGM 0
fence syncscope("workgroup") release
ret void
}
; Workgroup-scope seq_cst fence in a kernel that carries attribute group #1
; (defined outside this view; presumably a single-wave64 workgroup limit -
; confirm against the attribute list at the end of the file).
; Expected MIR after si-memory-legalizer:
;  * gfx900, gfx942 and the +wavefrontsize64 runs of gfx1010/gfx1200 see only
;    S_ENDPGM.
;  * the gfx1010/gfx1200 runs without +wavefrontsize64 see the full set of
;    soft waits followed by a cache invalidate (BUFFER_GL0_INV / GLOBAL_INV).
;  * gfx1250 sees soft waits on LOADCNT/STORECNT/DSCNT and no invalidate.
define amdgpu_kernel void @wg_fence_seq_cst_single64() #1 {
; GFX9-LABEL: name: wg_fence_seq_cst_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_fence_seq_cst_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: wg_fence_seq_cst_single64
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-W32-NEXT: S_WAITCNT_lds_direct
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: wg_fence_seq_cst_single64
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: wg_fence_seq_cst_single64
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: wg_fence_seq_cst_single64
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_fence_seq_cst_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: S_ENDPGM 0
fence syncscope("workgroup") seq_cst
ret void
}
; Workgroup-scope seq_cst atomic i32 load from global memory (addrspace 1) in
; a kernel that carries attribute group #0 (defined outside this view;
; presumably a single-wave32 workgroup limit - confirm against the attribute
; list at the end of the file). The loaded value %v is unused.
; Expected MIR after si-memory-legalizer: on every run line the
; GLOBAL_LOAD_DWORD_SADDR has a trailing immediate of 0 (presumably the
; cache-policy operand - confirm) and is not surrounded by any wait or cache
; invalidate; only the kernarg load and the zero offset V_MOV precede it.
define amdgpu_kernel void @wg_ld_seq_cst_single32(ptr addrspace(1) %p) #0 {
; GFX9-LABEL: name: wg_ld_seq_cst_single32
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX9-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_ld_seq_cst_single32
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX942-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-LABEL: name: wg_ld_seq_cst_single32
; GFX10: bb.0 (%ir-block.0):
; GFX10-NEXT: liveins: $sgpr4_sgpr5
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
; GFX10-NEXT: S_ENDPGM 0
;
; GFX12-LABEL: name: wg_ld_seq_cst_single32
; GFX12: bb.0 (%ir-block.0):
; GFX12-NEXT: liveins: $sgpr4_sgpr5
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
; GFX12-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_ld_seq_cst_single32
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX1250-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
; GFX1250-NEXT: S_ENDPGM 0
%v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
ret void
}
; Workgroup-scope seq_cst atomic i32 load from global memory (addrspace 1) in
; a kernel that carries attribute group #1 (defined outside this view;
; presumably a single-wave64 workgroup limit - confirm against the attribute
; list at the end of the file). The loaded value %v is unused.
; Expected MIR after si-memory-legalizer:
;  * gfx900, gfx942 and the +wavefrontsize64 runs of gfx1010/gfx1200 see a
;    bare GLOBAL_LOAD_DWORD_SADDR with trailing immediate 0 and no waits.
;  * the gfx1010/gfx1200 runs without +wavefrontsize64 see soft waits before
;    the load, a nonzero trailing immediate on the load (1 on gfx1010, 8 on
;    gfx1200; presumably cache-policy bits - confirm), then a load-counter
;    wait and a cache invalidate after it.
;  * gfx1250 sees soft waits before the load and a LOADCNT wait after it; the
;    trailing immediate stays 0 and no invalidate is expected.
define amdgpu_kernel void @wg_ld_seq_cst_single64(ptr addrspace(1) %p) #1 {
; GFX9-LABEL: name: wg_ld_seq_cst_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX9-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_ld_seq_cst_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX942-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: wg_ld_seq_cst_single64
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W32-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-W32-NEXT: S_WAITCNT_lds_direct
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
; GFX10-W32-NEXT: S_WAITCNT_soft .Vmcnt_0
; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: wg_ld_seq_cst_single64
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W64-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: wg_ld_seq_cst_single64
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: wg_ld_seq_cst_single64
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-W64-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_ld_seq_cst_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_ENDPGM 0
%v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
ret void
}
; Workgroup-scope seq_cst atomic i32 load from global memory (addrspace 1) in
; a kernel that carries attribute group #2 (defined outside this view;
; presumably allows a workgroup of several waves - confirm against the
; attribute list at the end of the file). The loaded value %v is unused.
; Expected MIR after si-memory-legalizer: every target keeps synchronization
; around the load.
;  * gfx900 and gfx942 see an LGKM soft wait plus S_WAITCNT_lds_direct before
;    the load; the load's trailing immediate is 0 on gfx900 and 1 on gfx942
;    (presumably cache-policy bits - confirm).
;  * gfx1010 and gfx1200 see soft waits before the load, a nonzero trailing
;    immediate (1 / 8), then a load-counter wait and a cache invalidate; both
;    wavefront sizes share one check prefix.
;  * gfx1250 sees soft waits before the load and a LOADCNT wait after it, with
;    trailing immediate 0 and no invalidate.
define amdgpu_kernel void @wg_ld_seq_cst_multi(ptr addrspace(1) %p) #2 {
; GFX9-LABEL: name: wg_ld_seq_cst_multi
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX9-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX9-NEXT: S_WAITCNT_lds_direct
; GFX9-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_ld_seq_cst_multi
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX942-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX942-NEXT: S_WAITCNT_lds_direct
; GFX942-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-LABEL: name: wg_ld_seq_cst_multi
; GFX10: bb.0 (%ir-block.0):
; GFX10-NEXT: liveins: $sgpr4_sgpr5
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-NEXT: S_WAITCNT_lds_direct
; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
; GFX10-NEXT: S_WAITCNT_soft .Vmcnt_0
; GFX10-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-NEXT: S_ENDPGM 0
;
; GFX12-LABEL: name: wg_ld_seq_cst_multi
; GFX12: bb.0 (%ir-block.0):
; GFX12-NEXT: liveins: $sgpr4_sgpr5
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
; GFX12-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_ld_seq_cst_multi
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_ENDPGM 0
%v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
ret void
}
; Workgroup-scope acquire atomic i32 load from global memory (addrspace 1) in
; a kernel that carries attribute group #1 (defined outside this view;
; presumably a single-wave64 workgroup limit - confirm against the attribute
; list at the end of the file). The loaded value %v is unused.
; Expected MIR after si-memory-legalizer:
;  * gfx900, gfx942 and the +wavefrontsize64 runs of gfx1010/gfx1200 see a
;    bare GLOBAL_LOAD_DWORD_SADDR with trailing immediate 0 and no waits.
;  * the gfx1010/gfx1200 runs without +wavefrontsize64 see no wait before the
;    load, a nonzero trailing immediate on the load (1 on gfx1010, 8 on
;    gfx1200; presumably cache-policy bits - confirm), then a load-counter
;    wait and a cache invalidate after it.
;  * gfx1250 sees only a LOADCNT wait after the load, with trailing immediate
;    0 and no invalidate.
define amdgpu_kernel void @wg_ld_acquire_single64(ptr addrspace(1) %p) #1 {
; GFX9-LABEL: name: wg_ld_acquire_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX9-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_ld_acquire_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX942-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: wg_ld_acquire_single64
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W32-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
; GFX10-W32-NEXT: S_WAITCNT_soft .Vmcnt_0
; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: wg_ld_acquire_single64
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W64-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: wg_ld_acquire_single64
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-W32-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: wg_ld_acquire_single64
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-W64-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_ld_acquire_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX1250-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_ENDPGM 0
%v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") acquire, align 4
ret void
}
; Workgroup-scope monotonic atomic i32 load from global memory (addrspace 1)
; in a kernel that carries attribute group #1 (defined outside this view;
; presumably a single-wave64 workgroup limit - confirm against the attribute
; list at the end of the file). The loaded value %v is unused.
; Expected MIR after si-memory-legalizer: no wait or cache invalidate on any
; target; only the load's trailing immediate differs (presumably cache-policy
; bits - confirm). It is 0 on gfx900 and gfx1250, 1 on gfx942 and gfx1010,
; and 8 on gfx1200.
; NOTE(review): gfx1010 and gfx1200 use one shared check prefix here, so the
; +wavefrontsize64 runs also expect the nonzero immediate, and gfx942 expects
; 1 as well. The acquire and seq_cst single64 loads in this file expect 0 for
; those same runs - confirm this difference is intended before regenerating.
define amdgpu_kernel void @wg_ld_monotonic_single64(ptr addrspace(1) %p) #1 {
; GFX9-LABEL: name: wg_ld_monotonic_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX9-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_ld_monotonic_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX942-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-LABEL: name: wg_ld_monotonic_single64
; GFX10: bb.0 (%ir-block.0):
; GFX10-NEXT: liveins: $sgpr4_sgpr5
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
; GFX10-NEXT: S_ENDPGM 0
;
; GFX12-LABEL: name: wg_ld_monotonic_single64
; GFX12: bb.0 (%ir-block.0):
; GFX12-NEXT: liveins: $sgpr4_sgpr5
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
; GFX12-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_ld_monotonic_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX1250-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
; GFX1250-NEXT: S_ENDPGM 0
%v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") monotonic, align 4
ret void
}
; Workgroup-scope seq_cst atomic store of %x through %p, compiled with
; attribute group #0 (defined outside this excerpt; presumably it describes a
; workgroup that fits in one wave for both wave sizes - confirm at the
; attribute definition).
; Expected output below: for gfx900, gfx942, gfx1010 and gfx1200 (either wave
; size) no soft-wait instruction precedes the store and its last immediate
; operand is 0; for gfx1250 the only instruction inserted before the store is
; S_WAIT_XCNT_soft 0.
define amdgpu_kernel void @wg_st_seq_cst_single32(ptr addrspace(1) %p, i32 %x) #0 {
; GFX9-LABEL: name: wg_st_seq_cst_single32
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX9-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_st_seq_cst_single32
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX942-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-LABEL: name: wg_st_seq_cst_single32
; GFX10: bb.0 (%ir-block.0):
; GFX10-NEXT: liveins: $sgpr4_sgpr5
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX10-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
; GFX10-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: wg_st_seq_cst_single32
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX12-W32-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: wg_st_seq_cst_single32
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX12-W64-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_st_seq_cst_single32
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX1250-NEXT: S_WAIT_XCNT_soft 0
; GFX1250-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
; GFX1250-NEXT: S_ENDPGM 0
store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
ret void
}
; Workgroup-scope seq_cst atomic store of %x through %p, compiled with
; attribute group #1 (defined outside this excerpt; presumably it describes a
; workgroup that fits in one wave only when the wave size is 64 - confirm at
; the attribute definition).
; Expected output below, per target:
;   - gfx900, gfx942, gfx1010 wave64, gfx1200 wave64 - no soft-wait
;     instruction before the store; last immediate operand of the store is 0.
;   - gfx1010 wave32 - S_WAITCNT_soft, S_WAITCNT_lds_direct and
;     S_WAITCNT_VSCNT_soft precede the store.
;   - gfx1200 wave32 - five S_WAIT_*_soft 0 instructions precede the store and
;     its last immediate operand is 8.
;   - gfx1250 - LOADCNT, STORECNT, DSCNT and XCNT soft waits precede the
;     store; its last immediate operand stays 0.
define amdgpu_kernel void @wg_st_seq_cst_single64(ptr addrspace(1) %p, i32 %x) #1 {
; GFX9-LABEL: name: wg_st_seq_cst_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX9-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_st_seq_cst_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX942-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: wg_st_seq_cst_single64
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX10-W32-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-W32-NEXT: S_WAITCNT_lds_direct
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: wg_st_seq_cst_single64
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX10-W64-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: wg_st_seq_cst_single64
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: wg_st_seq_cst_single64
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX12-W64-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_st_seq_cst_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: S_WAIT_XCNT_soft 0
; GFX1250-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
; GFX1250-NEXT: S_ENDPGM 0
store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
ret void
}
; Workgroup-scope seq_cst atomic store of %x through %p, compiled with
; attribute group #2 (defined outside this excerpt; presumably it describes a
; workgroup spanning several waves at either wave size - confirm at the
; attribute definition).
; Expected output below - every target has soft waits before the store:
;   - gfx900 - S_WAITCNT_soft .Lgkmcnt_0 and S_WAITCNT_lds_direct; last
;     immediate operand of the store is 0.
;   - gfx942 - the same two waits; last immediate operand of the store is 1.
;   - gfx1010, both wave sizes - S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0,
;     S_WAITCNT_lds_direct and S_WAITCNT_VSCNT_soft.
;   - gfx1200, both wave sizes - five S_WAIT_*_soft 0 instructions; last
;     immediate operand of the store is 8.
;   - gfx1250 - LOADCNT, STORECNT, DSCNT and XCNT soft waits; last immediate
;     operand of the store is 0.
define amdgpu_kernel void @wg_st_seq_cst_multi(ptr addrspace(1) %p, i32 %x) #2 {
; GFX9-LABEL: name: wg_st_seq_cst_multi
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX9-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX9-NEXT: S_WAITCNT_lds_direct
; GFX9-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_st_seq_cst_multi
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX942-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX942-NEXT: S_WAITCNT_lds_direct
; GFX942-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 1, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-LABEL: name: wg_st_seq_cst_multi
; GFX10: bb.0 (%ir-block.0):
; GFX10-NEXT: liveins: $sgpr4_sgpr5
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX10-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-NEXT: S_WAITCNT_lds_direct
; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
; GFX10-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: wg_st_seq_cst_multi
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: wg_st_seq_cst_multi
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX12-W64-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W64-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W64-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W64-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W64-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W64-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_st_seq_cst_multi
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: S_WAIT_XCNT_soft 0
; GFX1250-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
; GFX1250-NEXT: S_ENDPGM 0
store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
ret void
}
; Workgroup-scope release atomic store of %x through %p, compiled with
; attribute group #1 (defined outside this excerpt; presumably it describes a
; workgroup that fits in one wave only when the wave size is 64 - confirm at
; the attribute definition).
; The expected instruction sequences below match those of
; @wg_st_seq_cst_single64 apart from the ordering recorded in the memory
; operand (release here):
;   - gfx900, gfx942, gfx1010 wave64, gfx1200 wave64 - no soft-wait
;     instruction before the store; last immediate operand of the store is 0.
;   - gfx1010 wave32 - S_WAITCNT_soft, S_WAITCNT_lds_direct and
;     S_WAITCNT_VSCNT_soft precede the store.
;   - gfx1200 wave32 - five S_WAIT_*_soft 0 instructions precede the store and
;     its last immediate operand is 8.
;   - gfx1250 - LOADCNT, STORECNT, DSCNT and XCNT soft waits precede the
;     store; its last immediate operand stays 0.
define amdgpu_kernel void @wg_st_release_single64(ptr addrspace(1) %p, i32 %x) #1 {
; GFX9-LABEL: name: wg_st_release_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX9-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_st_release_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX942-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: wg_st_release_single64
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX10-W32-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-W32-NEXT: S_WAITCNT_lds_direct
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: wg_st_release_single64
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX10-W64-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: wg_st_release_single64
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: wg_st_release_single64
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX12-W64-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_st_release_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: S_WAIT_XCNT_soft 0
; GFX1250-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
; GFX1250-NEXT: S_ENDPGM 0
store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") release, align 4
ret void
}
define amdgpu_kernel void @wg_rmw_add_seq_cst_single32(ptr addrspace(1) %p) #0 {
; GFX9-LABEL: name: wg_rmw_add_seq_cst_single32
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
; GFX9-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX9-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX9-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.1 (%ir-block.11):
; GFX9-NEXT: successors: %bb.2(0x80000000)
; GFX9-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
; GFX9-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX9-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.2 (%ir-block.16):
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_rmw_add_seq_cst_single32
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
; GFX942-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX942-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX942-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: bb.1 (%ir-block.11):
; GFX942-NEXT: successors: %bb.2(0x80000000)
; GFX942-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
; GFX942-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX942-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: bb.2 (%ir-block.16):
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: wg_rmw_add_seq_cst_single32
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GFX10-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX10-W32-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
; GFX10-W32-NEXT: $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX10-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: bb.1 (%ir-block.7):
; GFX10-W32-NEXT: successors: %bb.2(0x80000000)
; GFX10-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX10-W32-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: bb.2 (%ir-block.11):
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: wg_rmw_add_seq_cst_single32
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
; GFX10-W64-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX10-W64-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX10-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: bb.1 (%ir-block.11):
; GFX10-W64-NEXT: successors: %bb.2(0x80000000)
; GFX10-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX10-W64-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: bb.2 (%ir-block.16):
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: wg_rmw_add_seq_cst_single32
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GFX12-W32-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
; GFX12-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX12-W32-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
; GFX12-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: bb.1 (%ir-block.7):
; GFX12-W32-NEXT: successors: %bb.2(0x80000000)
; GFX12-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
; GFX12-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX12-W32-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: bb.2 (%ir-block.11):
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: wg_rmw_add_seq_cst_single32
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX12-W64-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec
; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
; GFX12-W64-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
; GFX12-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: bb.1 (%ir-block.11):
; GFX12-W64-NEXT: successors: %bb.2(0x80000000)
; GFX12-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX12-W64-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: bb.2 (%ir-block.16):
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_rmw_add_seq_cst_single32
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GFX1250-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
; GFX1250-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX1250-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
; GFX1250-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: bb.1 (%ir-block.7):
; GFX1250-NEXT: successors: %bb.2(0x80000000)
; GFX1250-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
; GFX1250-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX1250-NEXT: S_WAIT_XCNT_soft 0
; GFX1250-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: bb.2 (%ir-block.11):
; GFX1250-NEXT: S_ENDPGM 0
%r = atomicrmw add ptr addrspace(1) %p, i32 7 syncscope("workgroup") seq_cst
ret void
}
; Workgroup-scope seq_cst atomicrmw add in a kernel tagged with attribute
; group #1. That group is defined outside this part of the file; judging by
; the function name it presumably caps the workgroup at 64 work-items
; (TODO confirm against the attribute list).
;
; What the generated checks below show:
;  * In every configuration only one lane performs the add: the lane whose
;    mbcnt over the saved exec mask is 0 issues a GLOBAL_ATOMIC_ADD_SADDR of
;    7 * popcount(saved exec).
;  * The gfx900, gfx942 and +wavefrontsize64 runs expect no wait or cache
;    invalidate instruction next to the atomic.
;  * The gfx1010 and gfx1200 runs without +wavefrontsize64 expect soft waits
;    before and after the atomic, followed by a cache invalidate
;    (BUFFER_GL0_INV / GLOBAL_INV). The gfx1250 run expects soft waits only.
;
; The check lines are produced by utils/update_mir_test_checks.py; regenerate
; them with that script rather than editing them by hand.
define amdgpu_kernel void @wg_rmw_add_seq_cst_single64(ptr addrspace(1) %p) #1 {
; GFX9-LABEL: name: wg_rmw_add_seq_cst_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
; GFX9-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX9-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX9-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.1 (%ir-block.11):
; GFX9-NEXT: successors: %bb.2(0x80000000)
; GFX9-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
; GFX9-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX9-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.2 (%ir-block.16):
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_rmw_add_seq_cst_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
; GFX942-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX942-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX942-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: bb.1 (%ir-block.11):
; GFX942-NEXT: successors: %bb.2(0x80000000)
; GFX942-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
; GFX942-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX942-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: bb.2 (%ir-block.16):
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: wg_rmw_add_seq_cst_single64
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GFX10-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX10-W32-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
; GFX10-W32-NEXT: $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX10-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: bb.1 (%ir-block.7):
; GFX10-W32-NEXT: successors: %bb.2(0x80000000)
; GFX10-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX10-W32-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-W32-NEXT: S_WAITCNT_lds_direct
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: bb.2 (%ir-block.11):
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: wg_rmw_add_seq_cst_single64
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
; GFX10-W64-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX10-W64-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX10-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: bb.1 (%ir-block.11):
; GFX10-W64-NEXT: successors: %bb.2(0x80000000)
; GFX10-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX10-W64-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: bb.2 (%ir-block.16):
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: wg_rmw_add_seq_cst_single64
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GFX12-W32-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
; GFX12-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX12-W32-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
; GFX12-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: bb.1 (%ir-block.7):
; GFX12-W32-NEXT: successors: %bb.2(0x80000000)
; GFX12-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
; GFX12-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: bb.2 (%ir-block.11):
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: wg_rmw_add_seq_cst_single64
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX12-W64-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec
; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
; GFX12-W64-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
; GFX12-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: bb.1 (%ir-block.11):
; GFX12-W64-NEXT: successors: %bb.2(0x80000000)
; GFX12-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX12-W64-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: bb.2 (%ir-block.16):
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_rmw_add_seq_cst_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GFX1250-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
; GFX1250-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX1250-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
; GFX1250-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: bb.1 (%ir-block.7):
; GFX1250-NEXT: successors: %bb.2(0x80000000)
; GFX1250-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
; GFX1250-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: S_WAIT_XCNT_soft 0
; GFX1250-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: bb.2 (%ir-block.11):
; GFX1250-NEXT: S_ENDPGM 0
; The returned value %r is never used; the kernel returns right after the atomic.
%r = atomicrmw add ptr addrspace(1) %p, i32 7 syncscope("workgroup") seq_cst
ret void
}
; Workgroup-scope seq_cst atomicrmw add in a kernel tagged with attribute
; group #2. That group is defined outside this part of the file; judging by
; the function name it presumably permits a workgroup spanning several waves
; on every tested target (TODO confirm against the attribute list).
;
; What the generated checks below show:
;  * In every configuration only one lane performs the add: the lane whose
;    mbcnt over the saved exec mask is 0 issues a GLOBAL_ATOMIC_ADD_SADDR of
;    7 * popcount(saved exec).
;  * Every configuration expects at least one soft wait before the atomic.
;  * The gfx900 and gfx942 runs expect S_WAITCNT_soft .Lgkmcnt_0 and
;    S_WAITCNT_lds_direct before the atomic and nothing after it.
;  * The gfx1010 and gfx1200 runs, with and without +wavefrontsize64, expect
;    soft waits before and after the atomic plus a trailing cache invalidate
;    (BUFFER_GL0_INV / GLOBAL_INV).
;  * The gfx1250 run expects soft waits before the atomic and a store-counter
;    wait after it, with no invalidate.
;
; The check lines are produced by utils/update_mir_test_checks.py; regenerate
; them with that script rather than editing them by hand.
define amdgpu_kernel void @wg_rmw_add_seq_cst_multi(ptr addrspace(1) %p) #2 {
; GFX9-LABEL: name: wg_rmw_add_seq_cst_multi
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
; GFX9-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX9-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX9-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.1 (%ir-block.11):
; GFX9-NEXT: successors: %bb.2(0x80000000)
; GFX9-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
; GFX9-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX9-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX9-NEXT: S_WAITCNT_lds_direct
; GFX9-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.2 (%ir-block.16):
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_rmw_add_seq_cst_multi
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
; GFX942-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX942-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX942-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: bb.1 (%ir-block.11):
; GFX942-NEXT: successors: %bb.2(0x80000000)
; GFX942-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
; GFX942-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX942-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX942-NEXT: S_WAITCNT_lds_direct
; GFX942-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: bb.2 (%ir-block.16):
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: wg_rmw_add_seq_cst_multi
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GFX10-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX10-W32-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
; GFX10-W32-NEXT: $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX10-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: bb.1 (%ir-block.7):
; GFX10-W32-NEXT: successors: %bb.2(0x80000000)
; GFX10-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX10-W32-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-W32-NEXT: S_WAITCNT_lds_direct
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: bb.2 (%ir-block.11):
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: wg_rmw_add_seq_cst_multi
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
; GFX10-W64-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX10-W64-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX10-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: bb.1 (%ir-block.11):
; GFX10-W64-NEXT: successors: %bb.2(0x80000000)
; GFX10-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX10-W64-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-W64-NEXT: S_WAITCNT_lds_direct
; GFX10-W64-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W64-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX10-W64-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W64-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: bb.2 (%ir-block.16):
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: wg_rmw_add_seq_cst_multi
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GFX12-W32-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
; GFX12-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX12-W32-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
; GFX12-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: bb.1 (%ir-block.7):
; GFX12-W32-NEXT: successors: %bb.2(0x80000000)
; GFX12-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
; GFX12-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: bb.2 (%ir-block.11):
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: wg_rmw_add_seq_cst_multi
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX12-W64-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec
; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
; GFX12-W64-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
; GFX12-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: bb.1 (%ir-block.11):
; GFX12-W64-NEXT: successors: %bb.2(0x80000000)
; GFX12-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX12-W64-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W64-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W64-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W64-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W64-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W64-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX12-W64-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W64-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: bb.2 (%ir-block.16):
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_rmw_add_seq_cst_multi
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GFX1250-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
; GFX1250-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX1250-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
; GFX1250-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: bb.1 (%ir-block.7):
; GFX1250-NEXT: successors: %bb.2(0x80000000)
; GFX1250-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
; GFX1250-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: S_WAIT_XCNT_soft 0
; GFX1250-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: bb.2 (%ir-block.11):
; GFX1250-NEXT: S_ENDPGM 0
; The returned value %r is never used; the kernel returns right after the atomic.
%r = atomicrmw add ptr addrspace(1) %p, i32 7 syncscope("workgroup") seq_cst
ret void
}
define amdgpu_kernel void @wg_rmw_xchg_acq_rel_single64(ptr addrspace(1) %p, i32 %x) #1 {
; GFX9-LABEL: name: wg_rmw_xchg_acq_rel_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX9-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_rmw_xchg_acq_rel_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX942-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: wg_rmw_xchg_acq_rel_single64
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX10-W32-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-W32-NEXT: S_WAITCNT_lds_direct
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: wg_rmw_xchg_acq_rel_single64
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX10-W64-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: wg_rmw_xchg_acq_rel_single64
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: wg_rmw_xchg_acq_rel_single64
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX12-W64-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_rmw_xchg_acq_rel_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: S_WAIT_XCNT_soft 0
; GFX1250-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_ENDPGM 0
%r = atomicrmw xchg ptr addrspace(1) %p, i32 %x syncscope("workgroup") acq_rel
ret void
}
; Workgroup-scope `acq_rel monotonic` cmpxchg on a global (addrspace 1) pointer
; in a kernel carrying attribute group #1 (defined outside this function; the
; "single64" suffix presumably means the workgroup fits in one 64-lane wave,
; TODO confirm against the attribute definition).
; Expectations encoded below: the GFX9, GFX942, GFX10-W64 and GFX12-W64 runs
; expect the atomic with no surrounding wait or invalidate instructions, while
; the GFX10-W32, GFX12-W32 and GFX1250 runs expect soft waits before and after
; it (followed by BUFFER_GL0_INV on GFX10-W32 and GLOBAL_INV on GFX12-W32).
; The check lines are autogenerated (see the file header); regenerate them with
; utils/update_mir_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @wg_cmpxchg_acq_rel_monotonic_single64(ptr addrspace(1) %p, i32 %cmp, i32 %new) #1 {
; GFX9-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX9-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX942-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr2_vgpr3, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W32-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX10-W32-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-W32-NEXT: S_WAITCNT_lds_direct
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W64-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX10-W64-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W32-NEXT: renamable $vgpr2, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX12-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W64-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX12-W64-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: renamable $vgpr2, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250 0, killed $sgpr3, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: S_WAIT_XCNT_soft 0
; GFX1250-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_ENDPGM 0
%r = cmpxchg ptr addrspace(1) %p, i32 %cmp, i32 %new syncscope("workgroup") acq_rel monotonic
ret void
}
; Workgroup-scope `seq_cst seq_cst` cmpxchg on a global (addrspace 1) pointer
; in a kernel carrying attribute group #2 (defined outside this function; the
; "multi" suffix presumably means the workgroup may span several waves,
; TODO confirm against the attribute definition).
; Expectations encoded below: no run expects the bare atomic. GFX9 and GFX942
; expect an LGKM soft wait ahead of it only; the shared GFX10 prefix and both
; GFX12 wave sizes expect waits before and after plus an invalidate
; (BUFFER_GL0_INV / GLOBAL_INV); GFX1250 expects waits before and a trailing
; S_WAIT_STORECNT_soft with no invalidate.
; The check lines are autogenerated (see the file header); regenerate them with
; utils/update_mir_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @wg_cmpxchg_seq_cst_seq_cst_multi(ptr addrspace(1) %p, i32 %cmp, i32 %new) #2 {
; GFX9-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX9-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX9-NEXT: S_WAITCNT_lds_direct
; GFX9-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX942-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX942-NEXT: S_WAITCNT_lds_direct
; GFX942-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr2_vgpr3, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
; GFX10: bb.0 (%ir-block.0):
; GFX10-NEXT: liveins: $sgpr4_sgpr5
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX10-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-NEXT: S_WAITCNT_lds_direct
; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W32-NEXT: renamable $vgpr2, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX12-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W64-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX12-W64-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W64-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W64-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W64-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W64-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W64-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX12-W64-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W64-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: renamable $vgpr2, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250 0, killed $sgpr3, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: S_WAIT_XCNT_soft 0
; GFX1250-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_ENDPGM 0
%r = cmpxchg ptr addrspace(1) %p, i32 %cmp, i32 %new syncscope("workgroup") seq_cst seq_cst
ret void
}
; Workgroup-scope `acquire acquire` cmpxchg on a global (addrspace 1) pointer
; in a kernel carrying attribute group #1 (defined outside this function; the
; "single64" suffix presumably means the workgroup fits in one 64-lane wave,
; TODO confirm against the attribute definition).
; Expectations encoded below: the GFX9, GFX942, GFX10-W64 and GFX12-W64 runs
; expect the atomic with no surrounding wait or invalidate instructions. The
; GFX10-W32 and GFX12-W32 runs expect nothing ahead of the atomic but a
; store-counter soft wait plus an invalidate (BUFFER_GL0_INV / GLOBAL_INV)
; after it. GFX1250 expects S_WAIT_XCNT_soft before the atomic and
; S_WAIT_STORECNT_soft after it.
; The check lines are autogenerated (see the file header); regenerate them with
; utils/update_mir_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @wg_cmpxchg_acquire_acquire_single64(ptr addrspace(1) %p, i32 %cmp, i32 %new) #1 {
; GFX9-LABEL: name: wg_cmpxchg_acquire_acquire_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX9-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: wg_cmpxchg_acquire_acquire_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX942-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr2_vgpr3, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: wg_cmpxchg_acquire_acquire_single64
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W32-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX10-W32-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: wg_cmpxchg_acquire_acquire_single64
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W64-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX10-W64-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: wg_cmpxchg_acquire_acquire_single64
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W32-NEXT: renamable $vgpr2, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX12-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
; GFX12-W32-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: wg_cmpxchg_acquire_acquire_single64
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W64-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX12-W64-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: wg_cmpxchg_acquire_acquire_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: renamable $vgpr2, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250 0, killed $sgpr3, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX1250-NEXT: S_WAIT_XCNT_soft 0
; GFX1250-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_ENDPGM 0
%r = cmpxchg ptr addrspace(1) %p, i32 %cmp, i32 %new syncscope("workgroup") acquire acquire
ret void
}
; Workgroup-scope `seq_cst` atomic i32 load from an LDS (addrspace 3) pointer
; in a kernel carrying attribute group #0 (defined outside this function; the
; "single32" suffix presumably means the workgroup fits in one 32-lane wave,
; TODO confirm against the attribute definition).
; Expectations encoded below: every run (GFX9, GFX942, the shared GFX10 and
; GFX12 prefixes, and GFX1250) expects a plain DS_READ_B32_gfx9 with no wait or
; invalidate instructions on either side of it.
; The check lines are autogenerated (see the file header); regenerate them with
; utils/update_mir_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @lds_wg_ld_seq_cst_single32(ptr addrspace(3) %p) #0 {
; GFX9-LABEL: name: lds_wg_ld_seq_cst_single32
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX9-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: lds_wg_ld_seq_cst_single32
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX942-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-LABEL: name: lds_wg_ld_seq_cst_single32
; GFX10: bb.0 (%ir-block.0):
; GFX10-NEXT: liveins: $sgpr4_sgpr5
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX10-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
; GFX10-NEXT: S_ENDPGM 0
;
; GFX12-LABEL: name: lds_wg_ld_seq_cst_single32
; GFX12: bb.0 (%ir-block.0):
; GFX12-NEXT: liveins: $sgpr4_sgpr5
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX12-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX12-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
; GFX12-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: lds_wg_ld_seq_cst_single32
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX1250-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
; GFX1250-NEXT: S_ENDPGM 0
%v = load atomic i32, ptr addrspace(3) %p syncscope("workgroup") seq_cst, align 4
ret void
}
; Workgroup-scope `seq_cst` atomic i32 load from an LDS (addrspace 3) pointer
; in a kernel carrying attribute group #1 (defined outside this function; the
; "single64" suffix presumably means the workgroup fits in one 64-lane wave,
; TODO confirm against the attribute definition).
; Expectations encoded below: the GFX9, GFX942, GFX10-W64 and GFX12-W64 runs
; expect a plain DS_READ_B32_gfx9 with nothing around it. The GFX10-W32,
; GFX12-W32 and GFX1250 runs expect soft waits before and after the load, with
; an additional invalidate (BUFFER_GL0_INV / GLOBAL_INV) after it on GFX10-W32
; and GFX12-W32 only.
; The check lines are autogenerated (see the file header); regenerate them with
; utils/update_mir_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @lds_wg_ld_seq_cst_single64(ptr addrspace(3) %p) #1 {
; GFX9-LABEL: name: lds_wg_ld_seq_cst_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX9-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: lds_wg_ld_seq_cst_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX942-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: lds_wg_ld_seq_cst_single64
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX10-W32-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-W32-NEXT: S_WAITCNT_lds_direct
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
; GFX10-W32-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX10-W32-NEXT: S_WAITCNT_lds_direct
; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: lds_wg_ld_seq_cst_single64
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX10-W64-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: lds_wg_ld_seq_cst_single64
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX12-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: lds_wg_ld_seq_cst_single64
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX12-W64-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: lds_wg_ld_seq_cst_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: S_ENDPGM 0
%v = load atomic i32, ptr addrspace(3) %p syncscope("workgroup") seq_cst, align 4
ret void
}
; Workgroup-scope seq_cst atomic load from LDS (addrspace(3)); the loaded
; value is never used.
; Uses attribute group #2, which is defined outside this excerpt
; (presumably the multi-wave workgroup configuration -- TODO confirm).
; In the checks below every run line expects soft waits both before and
; after the DS read. The gfx1010 and gfx1200 runs additionally expect an
; invalidate (BUFFER_GL0_INV / GLOBAL_INV) after it, and their wave32 and
; wave64 runs share a single set of checks.
define amdgpu_kernel void @lds_wg_ld_seq_cst_multi(ptr addrspace(3) %p) #2 {
; GFX9-LABEL: name: lds_wg_ld_seq_cst_multi
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX9-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX9-NEXT: S_WAITCNT_lds_direct
; GFX9-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
; GFX9-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX9-NEXT: S_WAITCNT_lds_direct
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: lds_wg_ld_seq_cst_multi
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX942-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX942-NEXT: S_WAITCNT_lds_direct
; GFX942-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
; GFX942-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX942-NEXT: S_WAITCNT_lds_direct
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-LABEL: name: lds_wg_ld_seq_cst_multi
; GFX10: bb.0 (%ir-block.0):
; GFX10-NEXT: liveins: $sgpr4_sgpr5
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX10-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-NEXT: S_WAITCNT_lds_direct
; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
; GFX10-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX10-NEXT: S_WAITCNT_lds_direct
; GFX10-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-NEXT: S_ENDPGM 0
;
; GFX12-LABEL: name: lds_wg_ld_seq_cst_multi
; GFX12: bb.0 (%ir-block.0):
; GFX12-NEXT: liveins: $sgpr4_sgpr5
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX12-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX12-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
; GFX12-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: lds_wg_ld_seq_cst_multi
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: S_ENDPGM 0
%v = load atomic i32, ptr addrspace(3) %p syncscope("workgroup") seq_cst, align 4
ret void
}
; Workgroup-scope release atomic store of %x to LDS (addrspace(3)).
; Uses attribute group #1, which is defined outside this excerpt
; (presumably a workgroup that fits in one 64-lane wave -- TODO confirm).
; In the checks below the gfx900 and gfx942 runs and the wave64 runs of
; gfx1010/gfx1200 expect the bare DS write with no waits. The wave32 runs
; of gfx1010/gfx1200 and the gfx1250 run expect soft waits in front of the
; write. No run expects anything between the write and S_ENDPGM.
define amdgpu_kernel void @lds_wg_st_release_single64(ptr addrspace(3) %p, i32 %x) #1 {
; GFX9-LABEL: name: lds_wg_st_release_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX9-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: lds_wg_st_release_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX942-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: lds_wg_st_release_single64
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX10-W32-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-W32-NEXT: S_WAITCNT_lds_direct
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: lds_wg_st_release_single64
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX10-W64-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: lds_wg_st_release_single64
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W32-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: lds_wg_st_release_single64
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX12-W64-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: lds_wg_st_release_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
; GFX1250-NEXT: S_ENDPGM 0
store atomic i32 %x, ptr addrspace(3) %p syncscope("workgroup") release, align 4
ret void
}
; Workgroup-scope acq_rel atomicrmw add of the constant 3 to LDS
; (addrspace(3)); the result %r is never used.
; Uses attribute group #1, which is defined outside this excerpt
; (presumably a workgroup that fits in one 64-lane wave -- TODO confirm).
; The expected MIR has three blocks in every run. bb.0 copies exec, runs
; V_MBCNT on the copy, and compares the count against 0 to set exec before
; branching. bb.1 issues one DS_ADD whose operand is S_BCNT1 of the exec
; copy multiplied by 3. bb.2 ends the program.
; The gfx900 and gfx942 runs and the wave64 runs of gfx1010/gfx1200 expect
; no waits around the DS_ADD. The wave32 runs of gfx1010/gfx1200 expect
; soft waits before and after it followed by an invalidate. The gfx1250
; run expects soft waits before and after it and no invalidate.
define amdgpu_kernel void @lds_wg_rmw_add_acq_rel_single64(ptr addrspace(3) %p) #1 {
; GFX9-LABEL: name: lds_wg_rmw_add_acq_rel_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
; GFX9-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX9-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX9-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.1 (%ir-block.11):
; GFX9-NEXT: successors: %bb.2(0x80000000)
; GFX9-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX9-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
; GFX9-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX9-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.2 (%ir-block.16):
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: lds_wg_rmw_add_acq_rel_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
; GFX942-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX942-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX942-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: bb.1 (%ir-block.11):
; GFX942-NEXT: successors: %bb.2(0x80000000)
; GFX942-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX942-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
; GFX942-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX942-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: bb.2 (%ir-block.16):
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: lds_wg_rmw_add_acq_rel_single64
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GFX10-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX10-W32-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
; GFX10-W32-NEXT: $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX10-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: bb.1 (%ir-block.7):
; GFX10-W32-NEXT: successors: %bb.2(0x80000000)
; GFX10-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: renamable $sgpr1 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX10-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
; GFX10-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX10-W32-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-W32-NEXT: S_WAITCNT_lds_direct
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
; GFX10-W32-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX10-W32-NEXT: S_WAITCNT_lds_direct
; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: bb.2 (%ir-block.11):
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: lds_wg_rmw_add_acq_rel_single64
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
; GFX10-W64-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX10-W64-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX10-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: bb.1 (%ir-block.11):
; GFX10-W64-NEXT: successors: %bb.2(0x80000000)
; GFX10-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX10-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
; GFX10-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX10-W64-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: bb.2 (%ir-block.16):
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: lds_wg_rmw_add_acq_rel_single64
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GFX12-W32-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
; GFX12-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX12-W32-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
; GFX12-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: bb.1 (%ir-block.7):
; GFX12-W32-NEXT: successors: %bb.2(0x80000000)
; GFX12-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr1 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX12-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
; GFX12-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
; GFX12-W32-NEXT: $vgpr1, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: bb.2 (%ir-block.11):
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: lds_wg_rmw_add_acq_rel_single64
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX12-W64-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec
; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
; GFX12-W64-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
; GFX12-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: bb.1 (%ir-block.11):
; GFX12-W64-NEXT: successors: %bb.2(0x80000000)
; GFX12-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX12-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
; GFX12-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX12-W64-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: bb.2 (%ir-block.16):
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: lds_wg_rmw_add_acq_rel_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; GFX1250-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
; GFX1250-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
; GFX1250-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
; GFX1250-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: bb.1 (%ir-block.7):
; GFX1250-NEXT: successors: %bb.2(0x80000000)
; GFX1250-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: renamable $sgpr1 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
; GFX1250-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
; GFX1250-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
; GFX1250-NEXT: $vgpr1, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: bb.2 (%ir-block.11):
; GFX1250-NEXT: S_ENDPGM 0
%r = atomicrmw add ptr addrspace(3) %p, i32 3 syncscope("workgroup") acq_rel
ret void
}
; Workgroup-scope cmpxchg on LDS (addrspace(3)) with success ordering
; acq_rel and failure ordering monotonic; the result %r is never used.
; Uses attribute group #1, which is defined outside this excerpt
; (presumably a workgroup that fits in one 64-lane wave -- TODO confirm).
; In the checks below the gfx900 and gfx942 runs and the wave64 runs of
; gfx1010/gfx1200 expect the bare DS compare-store with no waits. The
; wave32 runs of gfx1010/gfx1200 expect soft waits before and after it
; followed by an invalidate. The gfx1250 run expects soft waits before and
; after it and no invalidate.
define amdgpu_kernel void @lds_wg_cmpxchg_acq_rel_monotonic_single64(ptr addrspace(3) %p, i32 %cmp, i32 %new) #1 {
; GFX9-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX9-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX9-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX942-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX10-W32-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX10-W32-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-W32-NEXT: S_WAITCNT_lds_direct
; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-W32-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
; GFX10-W32-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX10-W32-NEXT: S_WAITCNT_lds_direct
; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX10-W64-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX10-W64-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W32-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX12-W32-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX12-W64-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX12-W64-NEXT: DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: S_ENDPGM 0
%r = cmpxchg ptr addrspace(3) %p, i32 %cmp, i32 %new syncscope("workgroup") acq_rel monotonic
ret void
}
; Workgroup-scope unordered atomic load from LDS (addrspace(3)); the loaded
; value is never used.
; Uses attribute group #1, which is defined outside this excerpt
; (presumably a workgroup that fits in one 64-lane wave -- TODO confirm).
; No run expects a DS load or any wait in the output. Each expected body is
; just S_ENDPGM, preceded on gfx1250 by the S_SETREG_IMM32_B32 that run
; also expects in the other kernels of this file.
define amdgpu_kernel void @lds_wg_ld_unordered_single64(ptr addrspace(3) %p) #1 {
; GFX9-LABEL: name: lds_wg_ld_unordered_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: lds_wg_ld_unordered_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-LABEL: name: lds_wg_ld_unordered_single64
; GFX10: bb.0 (%ir-block.0):
; GFX10-NEXT: S_ENDPGM 0
;
; GFX12-LABEL: name: lds_wg_ld_unordered_single64
; GFX12: bb.0 (%ir-block.0):
; GFX12-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: lds_wg_ld_unordered_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: S_ENDPGM 0
%v = load atomic i32, ptr addrspace(3) %p syncscope("workgroup") unordered, align 4
ret void
}
; cmpxchg on an addrspace(3) pointer at workgroup scope with success ordering
; monotonic and failure ordering acquire, in a kernel that uses attribute
; group #1 (flat work-group size "64,64").
; What the assertions below expect after the DS compare-and-store:
;   - gfx900, gfx942, and the wave64 runs of gfx1010 and gfx1200: nothing, the
;     next instruction is S_ENDPGM.
;   - gfx1010 wave32: S_WAITCNT_soft .Lgkmcnt_0 followed by BUFFER_GL0_INV.
;   - gfx1200 wave32: S_WAIT_DSCNT_soft 0 followed by GLOBAL_INV 8.
;   - gfx1250: S_WAIT_DSCNT_soft 0 only, with no invalidate.
; NOTE(review): presumably the wave64 runs drop the acquire sequence because a
; 64-work-item group fits in one wave there - confirm against the AMDGPU
; memory model documentation.
define amdgpu_kernel void @lds_wg_cmpxchg_monotonic_acquire_single64(ptr addrspace(3) %p, i32 %cmp, i32 %new) #1 {
; GFX9-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX9-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX9-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX942-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX10-W32-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX10-W32-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
; GFX10-W32-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX10-W64-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX10-W64-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W32-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX12-W32-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX12-W32-NEXT: DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX12-W64-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX12-W64-NEXT: DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
; GFX1250-NEXT: DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: S_ENDPGM 0
%r = cmpxchg ptr addrspace(3) %p, i32 %cmp, i32 %new syncscope("workgroup") monotonic acquire
ret void
}
; Atomic acquire i32 load through an addrspace(0) pointer at workgroup scope,
; in a kernel that uses attribute group #1 (flat work-group size "64,64").
; What the assertions below expect around the flat load:
;   - gfx900, gfx942, and the wave64 runs of gfx1010 and gfx1200: the load's
;     last immediate operand is 0 and S_ENDPGM follows directly.
;   - gfx1010 wave32: last immediate operand 1, then
;     S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0 and BUFFER_GL0_INV.
;   - gfx1200 wave32: last immediate operand 8, then S_WAIT_LOADCNT_soft 0,
;     S_WAIT_DSCNT_soft 0 and GLOBAL_INV 8.
;   - gfx1250: the SADDR form of the load with last immediate operand 0, then
;     S_WAIT_LOADCNT_soft 0 and S_WAIT_DSCNT_soft 0, with no invalidate.
; NOTE(review): the last immediate operand is presumably the cache-policy
; field - confirm against the FLAT instruction definitions.
define amdgpu_kernel void @flat_wg_ld_acquire_single64(ptr addrspace(0) %p) #1 {
; GFX9-LABEL: name: flat_wg_ld_acquire_single64
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
; GFX9-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: flat_wg_ld_acquire_single64
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $sgpr0_sgpr1, implicit $exec, implicit $exec
; GFX942-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-W32-LABEL: name: flat_wg_ld_acquire_single64
; GFX10-W32: bb.0 (%ir-block.0):
; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W32-NEXT: {{ $}}
; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
; GFX10-W32-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 1, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
; GFX10-W32-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10-W32-NEXT: S_ENDPGM 0
;
; GFX10-W64-LABEL: name: flat_wg_ld_acquire_single64
; GFX10-W64: bb.0 (%ir-block.0):
; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX10-W64-NEXT: {{ $}}
; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
; GFX10-W64-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
; GFX10-W64-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: flat_wg_ld_acquire_single64
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W32-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
; GFX12-W32-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 8, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: flat_wg_ld_acquire_single64
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
; GFX12-W64-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: flat_wg_ld_acquire_single64
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX1250-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: S_ENDPGM 0
%v = load atomic i32, ptr addrspace(0) %p syncscope("workgroup") acquire, align 4
ret void
}
; Atomic seq_cst i32 store of %x through an addrspace(0) pointer at workgroup
; scope, in a kernel that uses attribute group #2 (flat work-group size
; "64,256").
; Every run below expects soft waits to be kept in front of the flat store:
;   - gfx900 and gfx942: S_WAITCNT_soft .Lgkmcnt_0 and S_WAITCNT_lds_direct.
;     The gfx942 store carries 1 as its last immediate operand; gfx900 has 0.
;   - gfx1010: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0, S_WAITCNT_lds_direct and
;     S_WAITCNT_VSCNT_soft. Both wave sizes share one set of assertions.
;   - gfx1200, both wave sizes: soft waits on BVHCNT, SAMPLECNT, LOADCNT,
;     STORECNT and DSCNT. The store carries 8 as its last immediate operand.
;   - gfx1250: soft waits on LOADCNT, STORECNT, DSCNT and XCNT, in front of
;     the SADDR form of the store.
; No wait or invalidate is expected after the store on any target.
define amdgpu_kernel void @flat_wg_st_seq_cst_multi(ptr addrspace(0) %p, i32 %x) #2 {
; GFX9-LABEL: name: flat_wg_st_seq_cst_multi
; GFX9: bb.0 (%ir-block.0):
; GFX9-NEXT: liveins: $sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
; GFX9-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX9-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX9-NEXT: S_WAITCNT_lds_direct
; GFX9-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
; GFX9-NEXT: S_ENDPGM 0
;
; GFX942-LABEL: name: flat_wg_st_seq_cst_multi
; GFX942: bb.0 (%ir-block.0):
; GFX942-NEXT: liveins: $sgpr4_sgpr5
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $sgpr0_sgpr1, implicit $exec, implicit $exec
; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX942-NEXT: S_WAITCNT_soft .Lgkmcnt_0
; GFX942-NEXT: S_WAITCNT_lds_direct
; GFX942-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 1, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX10-LABEL: name: flat_wg_st_seq_cst_multi
; GFX10: bb.0 (%ir-block.0):
; GFX10-NEXT: liveins: $sgpr4_sgpr5
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX10-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX10-NEXT: S_WAITCNT_soft .Vmcnt_0_Lgkmcnt_0
; GFX10-NEXT: S_WAITCNT_lds_direct
; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
; GFX10-NEXT: S_ENDPGM 0
;
; GFX12-W32-LABEL: name: flat_wg_st_seq_cst_multi
; GFX12-W32: bb.0 (%ir-block.0):
; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W32-NEXT: {{ $}}
; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W32-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
; GFX12-W32-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W32-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 8, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
; GFX12-W32-NEXT: S_ENDPGM 0
;
; GFX12-W64-LABEL: name: flat_wg_st_seq_cst_multi
; GFX12-W64: bb.0 (%ir-block.0):
; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
; GFX12-W64-NEXT: {{ $}}
; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
; GFX12-W64-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
; GFX12-W64-NEXT: S_WAIT_BVHCNT_soft 0
; GFX12-W64-NEXT: S_WAIT_SAMPLECNT_soft 0
; GFX12-W64-NEXT: S_WAIT_LOADCNT_soft 0
; GFX12-W64-NEXT: S_WAIT_STORECNT_soft 0
; GFX12-W64-NEXT: S_WAIT_DSCNT_soft 0
; GFX12-W64-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 8, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
; GFX12-W64-NEXT: S_ENDPGM 0
;
; GFX1250-LABEL: name: flat_wg_st_seq_cst_multi
; GFX1250: bb.0 (%ir-block.0):
; GFX1250-NEXT: liveins: $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
; GFX1250-NEXT: S_WAIT_XCNT_soft 0
; GFX1250-NEXT: FLAT_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
; GFX1250-NEXT: S_ENDPGM 0
store atomic i32 %x, ptr addrspace(0) %p syncscope("workgroup") seq_cst, align 4
ret void
}
; Attribute groups that fix the flat work-group size of the test kernels.
; The string is a "min,max" pair of work-item counts (see the AMDGPU backend
; usage documentation for "amdgpu-flat-work-group-size").
; Group #0 - exactly 32 work-items; used by the *_single32 kernels.
attributes #0 = { "amdgpu-flat-work-group-size"="32,32" }
; Group #1 - exactly 64 work-items; used by the *_single64 kernels.
attributes #1 = { "amdgpu-flat-work-group-size"="64,64" }
; Group #2 - between 64 and 256 work-items; used by the *_multi kernels.
attributes #2 = { "amdgpu-flat-work-group-size"="64,256" }