; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
; Test single atomic RMW - s_wait_xcnt should be kept.
;
; The kernel performs one seq_cst atomicrmw add of 1 on the first i32 at %ptr;
; the returned old value (%val1) is unused. The checks below expect exactly one
; s_wait_xcnt 0x0, placed in the wait sequence ahead of the single
; global_atomic_add_u32.
define amdgpu_kernel void @single_atomic_rmw(ptr addrspace(1) %ptr) {
; GFX1250-LABEL: single_atomic_rmw:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
; Index 0: %gep1 addresses the first i32 element of %ptr.
%gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
ret void
}
; Test back-to-back atomic RMW operations - only first s_wait_xcnt should remain.
;
; Three seq_cst atomicrmw adds (operands 1, 2, 3) target three consecutive i32
; elements of %ptr, with no other IR instruction between them. The checks
; expect s_wait_xcnt 0x0 only in the wait sequence ahead of the first
; global_atomic_add_u32; the sequences ahead of the second and third atomics
; contain none.
define amdgpu_kernel void @atomic_rmw_back_to_back(ptr addrspace(1) %ptr) {
; GFX1250-LABEL: atomic_rmw_back_to_back:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 3
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
; Element indices 0, 1, 2 show up as byte offsets 0, 4, 8 in the checks above.
%gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
%gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
%gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
; The three atomics follow each other directly; their results are unused.
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
%val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
%val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
ret void
}
; Test atomic RMW block with interleaved ALU ops - should not break the block.
;
; An integer add and an integer mul of the kernel arguments %a and %b are
; placed between the atomics and supply the operands of the second and third
; atomicrmw. The checks show s_add_co_i32 and s_mul_i32 between the atomics and
; still expect s_wait_xcnt 0x0 only ahead of the first global_atomic_add_u32.
define amdgpu_kernel void @atomic_rmw_with_alu(ptr addrspace(1) %ptr, i32 %a, i32 %b) {
; GFX1250-LABEL: atomic_rmw_with_alu:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0xc
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_add_co_i32 s4, s2, s3
; GFX1250-NEXT: v_mov_b32_e32 v1, s4
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_mul_i32 s2, s2, s3
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
%gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
%gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
%gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
; ALU op between the first and second atomic; its result is the next operand.
%sum = add i32 %a, %b
%val2 = atomicrmw add ptr addrspace(1) %gep2, i32 %sum seq_cst
; ALU op between the second and third atomic; its result is the next operand.
%prod = mul i32 %a, %b
%val3 = atomicrmw add ptr addrspace(1) %gep3, i32 %prod seq_cst
ret void
}
; Test atomic RMW block broken by global load (VMEM).
;
; A non-atomic i32 load from the second global pointer (%load_ptr) sits between
; the second and third atomicrmw and supplies the operand of the third one.
; The checks expect s_wait_xcnt 0x0 ahead of the first atomic and again ahead
; of the third atomic (the one following global_load_b32), but not ahead of the
; second atomic.
define amdgpu_kernel void @atomic_rmw_broken_by_global_load(ptr addrspace(1) %ptr, ptr addrspace(1) %load_ptr) {
; GFX1250-LABEL: atomic_rmw_broken_by_global_load:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
%gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
%gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
%gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
%val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
; The interrupting operation: a plain global load whose value feeds %val3.
%load = load i32, ptr addrspace(1) %load_ptr
%val3 = atomicrmw add ptr addrspace(1) %gep3, i32 %load seq_cst
ret void
}
; Test atomic RMW block broken by global store (VMEM).
;
; A non-atomic store of 42 to the second global pointer (%store_ptr) sits
; between the second and third atomicrmw. The checks expect s_wait_xcnt 0x0 in
; three places: ahead of the first atomic, directly after global_store_b32
; (before the v_mov_b32 that rewrites v1 with 3), and ahead of the third
; atomic. There is none ahead of the second atomic.
define amdgpu_kernel void @atomic_rmw_broken_by_global_store(ptr addrspace(1) %ptr, ptr addrspace(1) %store_ptr) {
; GFX1250-LABEL: atomic_rmw_broken_by_global_store:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 42
; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 3
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
%gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
%gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
%gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
%val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
; The interrupting operation: a plain (non-atomic) global store.
store i32 42, ptr addrspace(1) %store_ptr
%val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
ret void
}
; Test atomic RMW block broken by FLAT load (VMEM).
;
; A non-atomic i32 load through the generic pointer %flat_ptr sits between the
; second and third atomicrmw and supplies the operand of the third one; the
; checks show it selected as flat_load_b32.
;
; NOTE(review): unlike @atomic_rmw_broken_by_global_load, the checks here
; contain s_wait_xcnt 0x0 only ahead of the first atomic and none ahead of the
; third one. Confirm that this matches the "broken by" intent in the title.
define amdgpu_kernel void @atomic_rmw_broken_by_flat_load(ptr addrspace(1) %ptr, ptr %flat_ptr) {
; GFX1250-LABEL: atomic_rmw_broken_by_flat_load:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
%gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
%gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
%gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
%val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
; The interrupting operation: a load through a generic (flat) pointer.
%load = load i32, ptr %flat_ptr, align 4
%val3 = atomicrmw add ptr addrspace(1) %gep3, i32 %load seq_cst
ret void
}
; Test atomic RMW block broken by FLAT store (VMEM).
;
; A non-atomic store of 42 through the generic pointer %flat_ptr sits between
; the second and third atomicrmw; the checks show it selected as
; flat_store_b32. The checks expect s_wait_xcnt 0x0 ahead of the first atomic
; and directly after flat_store_b32 (before the v_mov_b32 that rewrites v1
; with 3).
;
; NOTE(review): unlike @atomic_rmw_broken_by_global_store, the wait sequence
; ahead of the third atomic contains no s_wait_xcnt here. Confirm that this is
; the intended expectation.
define amdgpu_kernel void @atomic_rmw_broken_by_flat_store(ptr addrspace(1) %ptr, ptr %flat_ptr) {
; GFX1250-LABEL: atomic_rmw_broken_by_flat_store:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 42
; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3]
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 3
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
%gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
%gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
%gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
%val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
; The interrupting operation: a store through a generic (flat) pointer.
store i32 42, ptr %flat_ptr, align 4
%val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
ret void
}
; Test atomic RMW block broken by SMEM load.
;
; An i32 load from the addrspace(4) pointer %const_ptr sits between the second
; and third atomicrmw and supplies the operand of the third one; the checks
; show it selected as s_load_b32 followed by s_wait_kmcnt 0x0. The checks
; expect s_wait_xcnt 0x0 ahead of the first atomic and again ahead of the third
; atomic, but not ahead of the second.
define amdgpu_kernel void @atomic_rmw_broken_by_smem_load(ptr addrspace(1) %ptr, ptr addrspace(4) %const_ptr) {
; GFX1250-LABEL: atomic_rmw_broken_by_smem_load:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
%gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
%gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
%gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
%val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
; The interrupting operation: an addrspace(4) load whose value feeds %val3.
%load = load i32, ptr addrspace(4) %const_ptr
%val3 = atomicrmw add ptr addrspace(1) %gep3, i32 %load seq_cst
ret void
}
; Test atomic RMW block broken by atomic store.
;
; A seq_cst atomic store of 42 to the second global pointer (%store_ptr) sits
; between the second and third atomicrmw. The checks expect s_wait_xcnt 0x0 in
; four places: ahead of the first atomic, in the wait sequence ahead of
; global_store_b32, directly after global_store_b32, and ahead of the third
; atomic. There is none ahead of the second atomic.
define amdgpu_kernel void @atomic_rmw_broken_by_atomic_store(ptr addrspace(1) %ptr, ptr addrspace(1) %store_ptr) {
; GFX1250-LABEL: atomic_rmw_broken_by_atomic_store:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 42
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 3
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
%gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
%gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
%gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
%val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
; The interrupting operation: an atomic store (not an atomicrmw).
store atomic i32 42, ptr addrspace(1) %store_ptr seq_cst, align 4
%val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
ret void
}
; Test LDS load should not break atomic RMW block.
;
; An i32 load from the addrspace(3) pointer %lds_ptr sits between the second
; and third atomicrmw and supplies the operand of the third one; the checks
; show it selected as ds_load_b32. The checks expect s_wait_xcnt 0x0 only ahead
; of the first global_atomic_add_u32.
define amdgpu_kernel void @atomic_rmw_with_lds_load(ptr addrspace(1) %ptr, ptr addrspace(3) %lds_ptr) {
; GFX1250-LABEL: atomic_rmw_with_lds_load:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: ds_load_b32 v1, v1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
%gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
%gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
%gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
%val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
; Interleaved operation: an addrspace(3) load whose value feeds %val3.
%load = load i32, ptr addrspace(3) %lds_ptr, align 4
%val3 = atomicrmw add ptr addrspace(1) %gep3, i32 %load seq_cst
ret void
}
; Test LDS store should not break atomic RMW block.
;
; A store of 42 to the addrspace(3) pointer %lds_ptr sits between the second
; and third atomicrmw; the checks show it selected as ds_store_b32. The checks
; expect s_wait_xcnt 0x0 only ahead of the first global_atomic_add_u32.
define amdgpu_kernel void @atomic_rmw_with_lds_store(ptr addrspace(1) %ptr, ptr addrspace(3) %lds_ptr) {
; GFX1250-LABEL: atomic_rmw_with_lds_store:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v2, 42
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: ds_store_b32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v1, 3
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
%gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
%gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
%gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
%val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
; Interleaved operation: a store to an addrspace(3) pointer.
store i32 42, ptr addrspace(3) %lds_ptr, align 4
%val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
ret void
}
; Test FLAT load from LDS should not break atomic RMW block.
;
; The addrspace(3) pointer %lds_ptr is addrspacecast to a generic pointer and
; an i32 is loaded through it between the second and third atomicrmw; the
; loaded value is the operand of the third one. The checks show the cast
; expanded into scalar moves/selects using src_shared_base, followed by
; flat_load_b32. They expect s_wait_xcnt 0x0 only ahead of the first
; global_atomic_add_u32.
define amdgpu_kernel void @atomic_rmw_with_flat_lds_load(ptr addrspace(1) %ptr, ptr addrspace(3) %lds_ptr) {
; GFX1250-LABEL: atomic_rmw_with_flat_lds_load:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_mov_b64 s[6:7], 0
; GFX1250-NEXT: s_mov_b32 s4, s7
; GFX1250-NEXT: s_mov_b64 s[8:9], src_shared_base
; GFX1250-NEXT: s_mov_b32 s3, s9
; GFX1250-NEXT: s_mov_b32 s5, -1
; GFX1250-NEXT: s_cmp_lg_u32 s2, s5
; GFX1250-NEXT: s_cselect_b32 s4, s3, s4
; GFX1250-NEXT: s_mov_b32 s3, s6
; GFX1250-NEXT: s_cselect_b32 s2, s2, s3
; GFX1250-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GFX1250-NEXT: s_mov_b32 s3, s4
; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
%gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
%gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
%gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
%val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
; Interleaved operation: a generic-pointer load whose pointer is derived from
; the addrspace(3) argument.
%flat_lds = addrspacecast ptr addrspace(3) %lds_ptr to ptr
%load = load i32, ptr %flat_lds, align 4
%val3 = atomicrmw add ptr addrspace(1) %gep3, i32 %load seq_cst
ret void
}
; Test FLAT store to LDS should not break atomic RMW block.
;
; The addrspace(3) pointer %lds_ptr is addrspacecast to a generic pointer and
; 42 is stored through it between the second and third atomicrmw. The checks
; show the cast expanded into scalar moves/selects using src_shared_base,
; followed by flat_store_b32. The wait sequence ahead of the third atomic
; contains no s_wait_xcnt.
;
; NOTE(review): the checks do contain an s_wait_xcnt 0x0 directly after
; flat_store_b32 (before the v_mov_b32 that rewrites v1 with 3), in addition to
; the one ahead of the first atomic. Confirm that this is consistent with the
; "should not break" intent in the title.
define amdgpu_kernel void @atomic_rmw_with_flat_lds_store(ptr addrspace(1) %ptr, ptr addrspace(3) %lds_ptr) {
; GFX1250-LABEL: atomic_rmw_with_flat_lds_store:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_mov_b64 s[6:7], 0
; GFX1250-NEXT: s_mov_b32 s4, s7
; GFX1250-NEXT: s_mov_b64 s[8:9], src_shared_base
; GFX1250-NEXT: s_mov_b32 s3, s9
; GFX1250-NEXT: s_mov_b32 s5, -1
; GFX1250-NEXT: s_cmp_lg_u32 s2, s5
; GFX1250-NEXT: s_cselect_b32 s4, s3, s4
; GFX1250-NEXT: s_mov_b32 s3, s6
; GFX1250-NEXT: s_cselect_b32 s2, s2, s3
; GFX1250-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GFX1250-NEXT: s_mov_b32 s3, s4
; GFX1250-NEXT: v_mov_b32_e32 v1, 42
; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3]
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 3
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
%gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
%gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
%gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
%val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
; Interleaved operation: a generic-pointer store whose pointer is derived from
; the addrspace(3) argument.
%flat_lds = addrspacecast ptr addrspace(3) %lds_ptr to ptr
store i32 42, ptr %flat_lds, align 4
%val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
ret void
}
; Test atomic RMW block broken by a copy from global memory to LDS.
;
; The copy is written in the IR as an ordinary i32 load from the global pointer
; %src followed by an ordinary store of that value to the addrspace(3) pointer
; %dst (no dedicated async-copy operation appears in the IR); the checks show
; it selected as global_load_b32 followed by ds_store_b32. The copy sits
; between the second and third atomicrmw.
;
; The checks expect s_wait_xcnt 0x0 ahead of the first atomic and again ahead
; of the third atomic, but not ahead of the second. One more s_wait_xcnt 0x0
; appears in the kernel-argument loading sequence, before the s_load_b32 that
; overwrites s2 while s[2:3] is still the base of the preceding loads.
;
; NOTE(review): the kernel name spells "borken"; presumably "broken" was
; intended. Renaming requires changing the define and its label check together.
define amdgpu_kernel void @atomic_rmw_borken_by_async_lds_copy(ptr addrspace(1) %ptr, ptr addrspace(1) %src, ptr addrspace(3) %dst) {
; GFX1250-LABEL: atomic_rmw_borken_by_async_lds_copy:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b64 s[2:3], s[4:5]
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1250-NEXT: s_load_b64 s[4:5], s[2:3], 0x8
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x10
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ds_store_b32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v1, 3
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
%gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
%gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
%gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
%val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
; The interrupting operation: copy one i32 from global memory (%src) to the
; addrspace(3) destination (%dst) via a plain load/store pair.
%load = load i32, ptr addrspace(1) %src, align 4
store i32 %load, ptr addrspace(3) %dst, align 4
%val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
ret void
}
; Test multiple separate atomic RMW blocks.
; Two pairs of back-to-back seq_cst atomicrmw adds on %ptr1 are split by a load
; from %ptr2. The expected output has "s_wait_xcnt 0x0" ahead of the first
; atomic of each pair (offsets 0 and 8) and none ahead of the second atomic of
; each pair (offsets 4 and 12).
define amdgpu_kernel void @multiple_atomic_rmw_blocks(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2) {
; First block: %val1 and %val2.
; GFX1250-LABEL: multiple_atomic_rmw_blocks:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 4
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:12 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
%gep1 = getelementptr i32, ptr addrspace(1) %ptr1, i64 0
%gep2 = getelementptr i32, ptr addrspace(1) %ptr1, i64 1
%gep3 = getelementptr i32, ptr addrspace(1) %ptr1, i64 2
%gep4 = getelementptr i32, ptr addrspace(1) %ptr1, i64 3
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
%val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
; The load ends the first block; its result is the operand of %val3, which
; opens the second block (%val3 and %val4).
%load = load i32, ptr addrspace(1) %ptr2
%val3 = atomicrmw add ptr addrspace(1) %gep3, i32 %load seq_cst
%val4 = atomicrmw add ptr addrspace(1) %gep4, i32 4 seq_cst
ret void
}
; Test different atomic RMW operations in a block.
; Four consecutive seq_cst atomicrmw operations (add, sub, and, or) target
; adjacent i32 slots of %ptr with nothing in between. The expected output has
; "s_wait_xcnt 0x0" only ahead of the first one (the add); the sub, and, and or
; that follow have no such wait.
define amdgpu_kernel void @different_atomic_ops(ptr addrspace(1) %ptr) {
; GFX1250-LABEL: different_atomic_ops:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_sub_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 3
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_and_b32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 4
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_or_b32 v0, v1, s[0:1] offset:12 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
%gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
%gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
%gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
%gep4 = getelementptr i32, ptr addrspace(1) %ptr, i64 3
; One uninterrupted run of atomics, each with a different opcode.
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
%val2 = atomicrmw sub ptr addrspace(1) %gep2, i32 2 seq_cst
%val3 = atomicrmw and ptr addrspace(1) %gep3, i32 3 seq_cst
%val4 = atomicrmw or ptr addrspace(1) %gep4, i32 4 seq_cst
ret void
}
; Test atomic RMW block reset at basic block boundary.
; %entry holds two back-to-back seq_cst atomics; %then (taken when %cond == 0)
; holds a third. The expected output has "s_wait_xcnt 0x0" ahead of the first
; atomic in %entry, none ahead of the second, and one again ahead of the atomic
; in %then. The remaining s_wait_xcnt lines in the expected output directly
; follow the scratch_store/scratch_load spill code.
define amdgpu_kernel void @atomic_rmw_across_basic_blocks(ptr addrspace(1) %ptr, i32 %cond) {
; GFX1250-LABEL: atomic_rmw_across_basic_blocks:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX1250-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
; GFX1250-NEXT: v_writelane_b32 v2, s4, 0
; GFX1250-NEXT: v_writelane_b32 v2, s5, 1
; GFX1250-NEXT: s_or_saveexec_b32 s6, -1
; GFX1250-NEXT: scratch_store_b32 off, v2, off ; 4-byte Folded Spill
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 exec_lo, s6
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[2:3] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_mov_b32 s1, 0
; GFX1250-NEXT: s_cmp_lg_u32 s0, s1
; GFX1250-NEXT: s_cbranch_scc1 .LBB16_2
; GFX1250-NEXT: ; %bb.1: ; %then
; GFX1250-NEXT: s_or_saveexec_b32 s6, -1
; GFX1250-NEXT: scratch_load_b32 v2, off, off ; 4-byte Folded Reload
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 exec_lo, s6
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_readlane_b32 s0, v2, 0
; GFX1250-NEXT: v_readlane_b32 s1, v2, 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, 3
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: .LBB16_2: ; %exit
; GFX1250-NEXT: s_endpgm
entry:
%gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
%gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
; Two back-to-back atomics in the entry block.
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
%val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
%cmp = icmp eq i32 %cond, 0
br i1 %cmp, label %then, label %exit
then:
; A single atomic in a different basic block from %val1/%val2.
%gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
%val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
br label %exit
exit:
ret void
}
; Test atomic RMW block in loop.
; Each iteration performs two back-to-back seq_cst atomic adds of the induction
; variable %i to the first two i32 slots of %ptr; the loop repeats while
; %i.next is unsigned-less-than %n. Within the loop body the expected output
; has "s_wait_xcnt 0x0" ahead of the first atomic and none ahead of the second.
; The remaining s_wait_xcnt lines in the expected output directly follow the
; scratch_store/scratch_load spill code.
define amdgpu_kernel void @atomic_rmw_in_loop(ptr addrspace(1) %ptr, i32 %n) {
; GFX1250-LABEL: atomic_rmw_in_loop:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
; GFX1250-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_writelane_b32 v2, s2, 0
; GFX1250-NEXT: v_writelane_b32 v2, s3, 1
; GFX1250-NEXT: s_mov_b32 s0, 0
; GFX1250-NEXT: v_writelane_b32 v2, s1, 2
; GFX1250-NEXT: v_writelane_b32 v2, s0, 3
; GFX1250-NEXT: s_or_saveexec_b32 s6, -1
; GFX1250-NEXT: scratch_store_b32 off, v2, off ; 4-byte Folded Spill
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 exec_lo, s6
; GFX1250-NEXT: .LBB17_1: ; %loop
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-NEXT: s_or_saveexec_b32 s6, -1
; GFX1250-NEXT: scratch_load_b32 v2, off, off ; 4-byte Folded Reload
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 exec_lo, s6
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_readlane_b32 s0, v2, 3
; GFX1250-NEXT: v_readlane_b32 s1, v2, 2
; GFX1250-NEXT: v_readlane_b32 s2, v2, 0
; GFX1250-NEXT: v_readlane_b32 s3, v2, 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, s0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, s0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[2:3] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_mov_b32 s2, 1
; GFX1250-NEXT: s_add_co_i32 s0, s0, s2
; GFX1250-NEXT: s_cmp_lt_u32 s0, s1
; GFX1250-NEXT: v_writelane_b32 v2, s0, 3
; GFX1250-NEXT: s_mov_b32 s6, exec_lo
; GFX1250-NEXT: s_mov_b32 exec_lo, -1
; GFX1250-NEXT: scratch_store_b32 off, v2, off ; 4-byte Folded Spill
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 exec_lo, s6
; GFX1250-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1250-NEXT: ; %bb.2: ; %exit
; GFX1250-NEXT: s_endpgm
entry:
br label %loop
loop:
; %i starts at 0 and is incremented by 1 on every trip through the loop.
%i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
%gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
%gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
; Two back-to-back atomics, both adding the current %i.
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 %i seq_cst
%val2 = atomicrmw add ptr addrspace(1) %gep2, i32 %i seq_cst
%i.next = add i32 %i, 1
%cmp = icmp ult i32 %i.next, %n
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
; Test atomic RMW block with branch in between - state reset at new block.
; %entry holds two back-to-back seq_cst atomics and then branches on
; %cond == 0 to %bb1 or %bb2; each of those holds one atomic and both join at
; %merge, which holds one more. The expected output has "s_wait_xcnt 0x0"
; ahead of the first atomic in %entry and ahead of the single atomic in each of
; %bb1, %bb2 and %merge, and none ahead of the second atomic in %entry. The
; remaining s_wait_xcnt lines in the expected output directly follow the
; scratch_store/scratch_load spill code.
define amdgpu_kernel void @atomic_rmw_with_branch(ptr addrspace(1) %ptr, i32 %cond) {
; GFX1250-LABEL: atomic_rmw_with_branch:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX1250-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
; GFX1250-NEXT: v_writelane_b32 v2, s4, 0
; GFX1250-NEXT: v_writelane_b32 v2, s5, 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[2:3] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_mov_b32 s0, -1
; GFX1250-NEXT: s_mov_b32 s2, 0
; GFX1250-NEXT: s_cmp_lg_u32 s1, s2
; GFX1250-NEXT: v_writelane_b32 v2, s0, 2
; GFX1250-NEXT: s_mov_b32 s6, exec_lo
; GFX1250-NEXT: s_mov_b32 exec_lo, -1
; GFX1250-NEXT: scratch_store_b32 off, v2, off ; 4-byte Folded Spill
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 exec_lo, s6
; GFX1250-NEXT: s_cbranch_scc1 .LBB18_3
; GFX1250-NEXT: .LBB18_1: ; %Flow
; GFX1250-NEXT: s_or_saveexec_b32 s6, -1
; GFX1250-NEXT: scratch_load_b32 v2, off, off ; 4-byte Folded Reload
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 exec_lo, s6
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_readlane_b32 s0, v2, 2
; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX1250-NEXT: s_mov_b32 s0, 1
; GFX1250-NEXT: v_cmp_ne_u32_e64 s0, v0, s0
; GFX1250-NEXT: s_and_b32 vcc_lo, exec_lo, s0
; GFX1250-NEXT: s_cbranch_vccnz .LBB18_4
; GFX1250-NEXT: ; %bb.2: ; %bb1
; GFX1250-NEXT: s_or_saveexec_b32 s6, -1
; GFX1250-NEXT: scratch_load_b32 v2, off, off ; 4-byte Folded Reload
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 exec_lo, s6
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_readlane_b32 s0, v2, 0
; GFX1250-NEXT: v_readlane_b32 s1, v2, 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, 3
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_branch .LBB18_4
; GFX1250-NEXT: .LBB18_3: ; %bb2
; GFX1250-NEXT: s_or_saveexec_b32 s6, -1
; GFX1250-NEXT: scratch_load_b32 v2, off, off ; 4-byte Folded Reload
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 exec_lo, s6
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_readlane_b32 s0, v2, 0
; GFX1250-NEXT: v_readlane_b32 s1, v2, 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, 4
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:12 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_mov_b32 s0, 0
; GFX1250-NEXT: v_writelane_b32 v2, s0, 2
; GFX1250-NEXT: s_or_saveexec_b32 s6, -1
; GFX1250-NEXT: scratch_store_b32 off, v2, off ; 4-byte Folded Spill
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 exec_lo, s6
; GFX1250-NEXT: s_branch .LBB18_1
; GFX1250-NEXT: .LBB18_4: ; %merge
; GFX1250-NEXT: s_or_saveexec_b32 s6, -1
; GFX1250-NEXT: scratch_load_b32 v2, off, off ; 4-byte Folded Reload
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 exec_lo, s6
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_readlane_b32 s0, v2, 0
; GFX1250-NEXT: v_readlane_b32 s1, v2, 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, 5
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:16 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
entry:
%gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
%gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
; Two back-to-back atomics ahead of the conditional branch.
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
%val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
%cmp = icmp eq i32 %cond, 0
br i1 %cmp, label %bb1, label %bb2
bb1:
; Taken when %cond == 0: a single atomic on slot 2.
%gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
%val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
br label %merge
bb2:
; Taken when %cond != 0: a single atomic on slot 3.
%gep4 = getelementptr i32, ptr addrspace(1) %ptr, i64 3
%val4 = atomicrmw add ptr addrspace(1) %gep4, i32 4 seq_cst
br label %merge
merge:
; Join point reached from both %bb1 and %bb2: a single atomic on slot 4.
%gep5 = getelementptr i32, ptr addrspace(1) %ptr, i64 4
%val5 = atomicrmw add ptr addrspace(1) %gep5, i32 5 seq_cst
ret void
}
; Test fall-through block.
; %entry holds two back-to-back seq_cst atomics and branches unconditionally to
; %next, which holds two more. In the expected output %next (%bb.1) follows
; %entry directly with no branch instruction between them. "s_wait_xcnt 0x0"
; is expected ahead of the first atomic of each block and not ahead of the
; second atomic of either block. The remaining s_wait_xcnt lines in the
; expected output directly follow the scratch_store/scratch_load spill code.
define amdgpu_kernel void @atomic_rmw_fallthrough(ptr addrspace(1) %ptr) {
; GFX1250-LABEL: atomic_rmw_fallthrough:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_mov_b64 s[2:3], s[0:1]
; GFX1250-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
; GFX1250-NEXT: v_writelane_b32 v2, s2, 0
; GFX1250-NEXT: v_writelane_b32 v2, s3, 1
; GFX1250-NEXT: s_or_saveexec_b32 s6, -1
; GFX1250-NEXT: scratch_store_b32 off, v2, off ; 4-byte Folded Spill
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 exec_lo, s6
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ; %bb.1: ; %next
; GFX1250-NEXT: s_or_saveexec_b32 s6, -1
; GFX1250-NEXT: scratch_load_b32 v2, off, off ; 4-byte Folded Reload
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_mov_b32 exec_lo, s6
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_readlane_b32 s0, v2, 0
; GFX1250-NEXT: v_readlane_b32 s1, v2, 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, 3
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 4
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:12 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
entry:
%gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
%gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
; Two back-to-back atomics, then an unconditional branch to %next.
%val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
%val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
br label %next
next:
; Two more back-to-back atomics in the successor block.
%gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
%gep4 = getelementptr i32, ptr addrspace(1) %ptr, i64 3
%val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
%val4 = atomicrmw add ptr addrspace(1) %gep4, i32 4 seq_cst
ret void
}