; Source blob: 003aa049b2d1bddf5f648066d0aa5dabac980fb9 [file] [log] [blame] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=CHECK,GFX90A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=CHECK,GFX950 %s
;---------------------------------------------------------------------
; xchg i32 cases
;---------------------------------------------------------------------
; Returning i32 xchg: the swap operand is defined in an AGPR ("=a") and the
; old value is consumed from an AGPR ("a").
; The expected output for both targets copies a0 into v2 ahead of the atomic
; and copies the returned v0 back into a0 afterwards.
define void @flat_atomic_xchg_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i32] is byte offset 40, which appears as offset:40 above.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
; The swap operand comes from inline asm constrained to an AGPR.
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
; The old value returned by the atomic is required in an AGPR here.
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Returning i32 xchg: the swap operand is defined in an AGPR ("=a"), the old
; value is consumed from a VGPR ("v").
; The expected output copies a0 into v2 before the atomic; the result in v0 is
; used directly with no copy back to an AGPR.
define void @flat_atomic_xchg_i32_ret_a_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i32_ret_a_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i32_ret_a_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i32] is byte offset 40, which appears as offset:40 above.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
; The swap operand comes from inline asm constrained to an AGPR.
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
; The old value is required in a VGPR here.
call void asm "; use $0", "v"(i32 %result)
ret void
}
; Returning i32 xchg: the swap operand is defined in a VGPR ("=v"), the old
; value is consumed from an AGPR ("a").
; The expected output feeds v2 straight into the atomic and copies the
; returned v0 into a0 for the use.
define void @flat_atomic_xchg_i32_ret_v_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i32_ret_v_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i32_ret_v_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i32] is byte offset 40, which appears as offset:40 above.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
; The swap operand comes from inline asm constrained to a VGPR.
%data = call i32 asm "; def $0", "=v"()
%result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
; The old value is required in an AGPR here.
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Returning i32 xchg: both the swap operand and the use of the old value are
; constrained with "^VA" (AV, either register kind).
; The expected output picks VGPRs on both sides (def v2, use v0), so no
; v_accvgpr copies appear.
define void @flat_atomic_xchg_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i32] is byte offset 40, which appears as offset:40 above.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
; The swap operand comes from inline asm with the AV constraint.
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
; The old value is consumed through the AV constraint as well.
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Returning i32 xchg: the swap operand is defined with "^VA" (AV), the old
; value is consumed from a VGPR ("v").
; The expected output defines the operand in v2 and uses the result in v0
; with no v_accvgpr copies.
define void @flat_atomic_xchg_i32_ret_av_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i32_ret_av_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i32] is byte offset 40, which appears as offset:40 above.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
; The swap operand comes from inline asm with the AV constraint.
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
; The old value is required in a VGPR here.
call void asm "; use $0", "v"(i32 %result)
ret void
}
; Returning i32 xchg: the swap operand is defined with "^VA" (AV), the old
; value is consumed from an AGPR ("a").
; The expected output defines the operand in v2 and copies the returned v0
; into a0 for the use.
define void @flat_atomic_xchg_i32_ret_av_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i32_ret_av_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i32] is byte offset 40, which appears as offset:40 above.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
; The swap operand comes from inline asm with the AV constraint.
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
; The old value is required in an AGPR here.
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Returning i32 xchg: the swap operand is defined in an AGPR ("=a"), the old
; value is consumed with "^VA" (AV).
; The expected output copies a0 into v2 before the atomic and uses the
; returned v0 directly.
define void @flat_atomic_xchg_i32_ret_a_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i32_ret_a_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i32_ret_a_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i32] is byte offset 40, which appears as offset:40 above.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
; The swap operand comes from inline asm constrained to an AGPR.
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
; The old value is consumed through the AV constraint.
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Returning i32 xchg: the swap operand is defined in a VGPR ("=v"), the old
; value is consumed with "^VA" (AV).
; The expected output stays entirely in VGPRs (def v2, use v0).
define void @flat_atomic_xchg_i32_ret_v_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i32_ret_v_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i32_ret_v_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i32] is byte offset 40, which appears as offset:40 above.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
; The swap operand comes from inline asm constrained to a VGPR.
%data = call i32 asm "; def $0", "=v"()
%result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
; The old value is consumed through the AV constraint.
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Returning i32 xchg with AV ("^VA") operand and result while v[0:31] and
; v[32:63] are defined before the atomic and used after it, so both vector
; tuples are live across the atomic.
; In the expected output the AV operand is defined in a2 and the result is used
; from a0; v40-v63 are parked in AGPRs, and part of v[0:31] is spilled to
; scratch around the atomic.
; NOTE(review): the "_no_agprs" suffix does not obviously match the expected
; output, which relies heavily on AGPRs; the intent presumably depends on
; attribute group #0, which is defined outside this view - confirm.
define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i32_ret_av_av_no_agprs:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:31]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:31]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_av_no_agprs:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:31]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill
; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill
; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload
; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload
; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload
; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload
; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload
; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:31]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i32] is byte offset 40, which appears as offset:40 above.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
; The swap operand comes from inline asm with the AV constraint.
%data = call i32 asm "; def $0", "=^VA"()
; Define both 32-register VGPR tuples. The template only prints $0, so only
; v[0:31] shows up in the emitted "; def" text.
%vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"()
%vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0
%vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1
%result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
; Consume both tuples after the atomic so they stay live across it.
call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1)
; The old value is consumed through the AV constraint.
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; i32 xchg whose result is unused, with the swap operand defined in an AGPR
; ("=a").
; The expected flat_atomic_swap has no destination register and takes a0
; directly as its data operand, with no copy through a VGPR.
define void @flat_atomic_xchg_i32_noret_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i32_noret_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap v[0:1], a0 offset:40
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i32_noret_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap v[0:1], a0 offset:40 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i32] is byte offset 40, which appears as offset:40 above.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
; The swap operand comes from inline asm constrained to an AGPR.
%data = call i32 asm "; def $0", "=a"()
; The returned old value is deliberately never used.
%unused = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
ret void
}
; i32 xchg whose result is unused, with the swap operand defined through the
; AV constraint ("=^VA").
; The expected flat_atomic_swap has no destination register and takes the
; operand from v2.
define void @flat_atomic_xchg_i32_noret_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i32_noret_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap v[0:1], v2 offset:40
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i32_noret_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap v[0:1], v2 offset:40 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i32] is byte offset 40, which appears as offset:40 above.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
; The swap operand comes from inline asm with the AV constraint.
%data = call i32 asm "; def $0", "=^VA"()
; The returned old value is deliberately never used.
%unused = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
ret void
}
;---------------------------------------------------------------------
; xchg i64 cases
;---------------------------------------------------------------------
; Returning i64 xchg: the swap operand is defined in an AGPR pair ("=a") and
; the old value is consumed from an AGPR pair ("a").
; Unlike the i32 cases, the expected output branches on the address: the high
; address dword is compared against the high dword of src_private_base, then
; either the %atomicrmw.global block (flat_atomic_swap_x2) or the
; %atomicrmw.private block (load the old value, then store the new one) runs.
; On gfx90a the operand is copied from a[0:1] into v[2:3] for both paths; on
; gfx950 it is defined in a[2:3] and the private path stores it straight from
; the AGPRs.
define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB11_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB11_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB11_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB11_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
; GFX950-NEXT: v_accvgpr_read_b32 v3, a3
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB11_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $agpr2_agpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB11_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB11_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 v0, a[2:3], off
; GFX950-NEXT: .LBB11_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64] is byte offset 80 (0x50); the expected output adds
; it to the pointer explicitly instead of using an instruction offset.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; The 64-bit swap operand comes from inline asm constrained to AGPRs.
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst
; The 64-bit old value is required in AGPRs here.
call void asm "; use $0", "a"(i64 %result)
ret void
}
; Input is AGPR, result used as VGPR.
; 64-bit seq_cst xchg through a flat pointer. The data operand is produced by
; inline asm under the "a" constraint and the returned value is consumed by
; inline asm under the "v" constraint. The expected output compares the upper
; address dword against src_private_base and then runs either the
; %atomicrmw.global path (flat_atomic_swap_x2) or the %atomicrmw.private path
; (a plain load of the old value followed by a store of the new one).
; On gfx90a both paths read the data from the v[4:5] copy of a[0:1]; on gfx950
; the private-path store reads a[0:1] directly.
define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_a_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB12_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB12_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB12_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB12_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_ret_a_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB12_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB12_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB12_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 v2, a[0:1], off
; GFX950-NEXT: .LBB12_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a [512 x i64] array, i.e. byte offset 80 (the 0x50 added in
; the expected output).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; Opaque definition of the value to exchange in, constrained to "a".
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst
; Opaque use of the returned old value, constrained to "v".
call void asm "; use $0", "v"(i64 %result)
ret void
}
; Input is VGPR, result used as AGPR.
; 64-bit seq_cst xchg through a flat pointer. The data operand is produced by
; inline asm under the "v" constraint and the returned value is consumed by
; inline asm under the "a" constraint. The expected output splits into the
; %atomicrmw.global and %atomicrmw.private paths after comparing the upper
; address dword against src_private_base. On the global path the result lands
; in v[0:1] and is copied to a[0:1] with v_accvgpr_write_b32; on the private
; path the old value is loaded straight into a0/a1.
define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_v_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB13_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB13_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB13_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB13_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_ret_v_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB13_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB13_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB13_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off
; GFX950-NEXT: .LBB13_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a [512 x i64] array, i.e. byte offset 80 (the 0x50 added in
; the expected output).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; Opaque definition of the value to exchange in, constrained to "v".
%data = call i64 asm "; def $0", "=v"()
%result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst
; Opaque use of the returned old value, constrained to "a".
call void asm "; use $0", "a"(i64 %result)
ret void
}
; Input is AV, result also used as AV.
; 64-bit seq_cst xchg through a flat pointer where both the data operand and
; the consumer of the returned value are inline asm using the "^VA"
; constraint. In the expected output both end up in VGPRs (def v[4:5],
; use v[0:1]) and no v_accvgpr copies appear. The address check against
; src_private_base selects between the %atomicrmw.global path
; (flat_atomic_swap_x2) and the %atomicrmw.private path (load old value, then
; store new value).
define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB14_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB14_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB14_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB14_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB14_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB14_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB14_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off
; GFX950-NEXT: .LBB14_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a [512 x i64] array, i.e. byte offset 80 (the 0x50 added in
; the expected output).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; Opaque definition of the value to exchange in, constrained to "^VA".
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst
; Opaque use of the returned old value, constrained to "^VA".
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; Input is AV, result used as VGPR.
; 64-bit seq_cst xchg through a flat pointer. The data operand comes from
; inline asm using the "^VA" constraint and the returned value is consumed by
; inline asm using the "v" constraint. In the expected output the data is
; defined in v[4:5] and the result is used from v[0:1], with no v_accvgpr
; copies. The address check against src_private_base selects between the
; %atomicrmw.global path (flat_atomic_swap_x2) and the %atomicrmw.private path
; (load old value, then store new value).
define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB15_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB15_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB15_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB15_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_ret_av_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB15_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB15_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB15_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off
; GFX950-NEXT: .LBB15_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a [512 x i64] array, i.e. byte offset 80 (the 0x50 added in
; the expected output).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; Opaque definition of the value to exchange in, constrained to "^VA".
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst
; Opaque use of the returned old value, constrained to "v".
call void asm "; use $0", "v"(i64 %result)
ret void
}
; Input is AV, result used as AGPR.
; 64-bit seq_cst xchg through a flat pointer. The data operand comes from
; inline asm using the "^VA" constraint and the returned value is consumed by
; inline asm using the "a" constraint. In the expected output the data is
; defined in v[2:3]. On the %atomicrmw.global path the result is returned in
; v[0:1] and copied to a[0:1] with v_accvgpr_write_b32; on the
; %atomicrmw.private path the old value is loaded straight into a0/a1 before
; the new value is stored.
define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB16_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB16_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB16_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB16_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_ret_av_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB16_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB16_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB16_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off
; GFX950-NEXT: .LBB16_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a [512 x i64] array, i.e. byte offset 80 (the 0x50 added in
; the expected output).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; Opaque definition of the value to exchange in, constrained to "^VA".
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst
; Opaque use of the returned old value, constrained to "a".
call void asm "; use $0", "a"(i64 %result)
ret void
}
; Input is AGPR, result used as AV.
; 64-bit seq_cst xchg through a flat pointer. The data operand comes from
; inline asm using the "a" constraint and the returned value is consumed by
; inline asm using the "^VA" constraint. In the expected output the data is
; defined in a[0:1] and read into v[4:5] with v_accvgpr_read_b32, and the
; result is used from v[0:1]. The address check against src_private_base
; selects between the %atomicrmw.global path (flat_atomic_swap_x2) and the
; %atomicrmw.private path (load old value, then store new value); on gfx950 the
; private-path store reads a[0:1] directly.
define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_a_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB17_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB17_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB17_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB17_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_ret_a_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB17_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB17_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB17_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 v2, a[0:1], off
; GFX950-NEXT: .LBB17_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a [512 x i64] array, i.e. byte offset 80 (the 0x50 added in
; the expected output).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; Opaque definition of the value to exchange in, constrained to "a".
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst
; Opaque use of the returned old value, constrained to "^VA".
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; Input is VGPR, result used as AV.
; 64-bit seq_cst xchg through a flat pointer. The data operand comes from
; inline asm using the "v" constraint and the returned value is consumed by
; inline asm using the "^VA" constraint. In the expected output the data is
; defined in v[4:5] and the result is used from v[0:1], with no v_accvgpr
; copies. The address check against src_private_base selects between the
; %atomicrmw.global path (flat_atomic_swap_x2) and the %atomicrmw.private path
; (load old value, then store new value).
define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_v_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB18_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB18_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB18_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_ret_v_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB18_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB18_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB18_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off
; GFX950-NEXT: .LBB18_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a [512 x i64] array, i.e. byte offset 80 (the 0x50 added in
; the expected output).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; Opaque definition of the value to exchange in, constrained to "v".
%data = call i64 asm "; def $0", "=v"()
%result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst
; Opaque use of the returned old value, constrained to "^VA".
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; Input is AGPR, result of the atomic is unused.
; 64-bit seq_cst xchg through a flat pointer whose data operand comes from
; inline asm using the "a" constraint. Because the returned value is never
; used, the expected output uses the no-return form of flat_atomic_swap_x2
; (taking a[0:1] directly as data) on the %atomicrmw.global path, and only a
; store (no load of the old value) on the %atomicrmw.private path.
define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_noret_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB19_3
; GFX90A-NEXT: ; %bb.1: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB19_4
; GFX90A-NEXT: .LBB19_2: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
; GFX90A-NEXT: .LBB19_3: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], a[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB19_2
; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_noret_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB19_3
; GFX950-NEXT: ; %bb.1: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB19_4
; GFX950-NEXT: .LBB19_2: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
; GFX950-NEXT: .LBB19_3: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], a[0:1] sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB19_2
; GFX950-NEXT: .LBB19_4: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX950-NEXT: scratch_store_dwordx2 v0, a[0:1], off
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): %gep.0 is never used; the atomicrmw below addresses %ptr
; directly, which is why the expected output contains no 0x50 offset. The
; returning xchg tests in this file address %gep.0 instead - confirm which was
; intended before regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; Opaque definition of the value to exchange in, constrained to "a".
%data = call i64 asm "; def $0", "=a"()
; The old value is deliberately left unused.
%unused = atomicrmw xchg ptr %ptr, i64 %data seq_cst
ret void
}
; Input is AV, result of the atomic is unused.
; 64-bit seq_cst xchg through a flat pointer whose data operand comes from
; inline asm using the "^VA" constraint. In the expected output the data is
; defined in v[2:3]; the %atomicrmw.global path uses the no-return form of
; flat_atomic_swap_x2 and the %atomicrmw.private path only stores the new value
; (no load of the old one).
define void @flat_atomic_xchg_i64_noret_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_noret_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB20_3
; GFX90A-NEXT: ; %bb.1: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB20_4
; GFX90A-NEXT: .LBB20_2: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
; GFX90A-NEXT: .LBB20_3: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB20_2
; GFX90A-NEXT: .LBB20_4: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_noret_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB20_3
; GFX950-NEXT: ; %bb.1: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB20_4
; GFX950-NEXT: .LBB20_2: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
; GFX950-NEXT: .LBB20_3: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB20_2
; GFX950-NEXT: .LBB20_4: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): %gep.0 is never used; the atomicrmw below addresses %ptr
; directly, which is why the expected output contains no 0x50 offset. The
; returning xchg tests in this file address %gep.0 instead - confirm which was
; intended before regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; Opaque definition of the value to exchange in, constrained to "^VA".
%data = call i64 asm "; def $0", "=^VA"()
; The old value is deliberately left unused.
%unused = atomicrmw xchg ptr %ptr, i64 %data seq_cst
ret void
}
;---------------------------------------------------------------------
; xor i32 cases with cmpxchg expansion
;---------------------------------------------------------------------
; Input and result use AGPR.
; 32-bit seq_cst xor through a flat pointer. The data operand is produced by
; inline asm under the "a" constraint and the returned value is consumed by
; inline asm under the "a" constraint. The expected output is a compare-and-swap
; loop (%atomicrmw.start): an initial flat_load_dword, then v_xor_b32 plus
; flat_atomic_cmpswap repeated until the value returned by the cmpswap equals
; the value the xor was computed from. The AGPR input is read into v4 before the
; loop and the loop result is written back to a0 with v_accvgpr_write_b32.
define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB21_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): %gep.0 is never used; the atomicrmw below addresses %ptr
; directly, which is why the expected output loads from v[0:1] with no offset.
; The i32 xchg test at the top of this file addresses %gep.0 instead - confirm
; which was intended before regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
; Opaque definition of the xor operand, constrained to "a".
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw xor ptr %ptr, i32 %data seq_cst
; Opaque use of the returned old value, constrained to "a".
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Input is AGPR, result used as VGPR.
; atomicrmw xor (seq_cst) through a flat pointer, with the data operand defined
; under the "a" constraint and the returned value consumed under the "v"
; constraint. The expected code copies a0 to a VGPR with v_accvgpr_read_b32
; before the flat_atomic_cmpswap retry loop; the result stays in v2 and no
; v_accvgpr_write_b32 is emitted.
define void @flat_atomic_xor_expansion_i32_ret_a_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_a_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_a_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v2, v[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v3, a0
; GFX950-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB22_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): %gep.0 is never used; the atomicrmw addresses %ptr, which is
; why the expected flat_load_dword / flat_atomic_cmpswap carry no offset.
; Confirm whether the atomic was meant to use %gep.0; if so the assertions
; need to be regenerated.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw xor ptr %ptr, i32 %data seq_cst
call void asm "; use $0", "v"(i32 %result)
ret void
}
; Input is VGPR, result used as AGPR.
; atomicrmw xor (seq_cst) through a flat pointer, with the data operand defined
; under the "v" constraint and the returned value consumed under the "a"
; constraint. The expected code feeds v4 straight into the
; flat_atomic_cmpswap retry loop and copies the returned value to a0 with
; v_accvgpr_write_b32 inside the loop.
define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_v_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_v_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1]
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB23_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): %gep.0 is never used; the atomicrmw addresses %ptr, which is
; why the expected flat_load_dword / flat_atomic_cmpswap carry no offset.
; Confirm whether the atomic was meant to use %gep.0; if so the assertions
; need to be regenerated.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=v"()
%result = atomicrmw xor ptr %ptr, i32 %data seq_cst
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Input is AV, result also used as AV.
; atomicrmw xor (seq_cst) through a flat pointer, with both the data operand
; and the returned value using the "^VA" constraint. In the expected code both
; operands end up in VGPRs (def v3 / use v2), so the flat_atomic_cmpswap retry
; loop needs no accvgpr copies.
define void @flat_atomic_xor_expansion_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v2, v[0:1]
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB24_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): %gep.0 is never used; the atomicrmw addresses %ptr, which is
; why the expected flat_load_dword / flat_atomic_cmpswap carry no offset.
; Confirm whether the atomic was meant to use %gep.0; if so the assertions
; need to be regenerated.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw xor ptr %ptr, i32 %data seq_cst
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Input is AV, result used as VGPR.
; atomicrmw xor (seq_cst) through a flat pointer, with the data operand defined
; under the "^VA" constraint and the returned value consumed under the "v"
; constraint. In the expected code the AV input is placed in a VGPR (def v3)
; and the result stays in v2, so the flat_atomic_cmpswap retry loop needs no
; accvgpr copies.
define void @flat_atomic_xor_expansion_i32_ret_av_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_av_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB25_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_av_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v2, v[0:1]
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB25_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): %gep.0 is never used; the atomicrmw addresses %ptr, which is
; why the expected flat_load_dword / flat_atomic_cmpswap carry no offset.
; Confirm whether the atomic was meant to use %gep.0; if so the assertions
; need to be regenerated.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw xor ptr %ptr, i32 %data seq_cst
call void asm "; use $0", "v"(i32 %result)
ret void
}
; Input is AV, result used as AGPR.
; atomicrmw xor (seq_cst) through a flat pointer, with the data operand defined
; under the "^VA" constraint and the returned value consumed under the "a"
; constraint. In the expected code the AV input is placed in a VGPR (def v4)
; and the returned value is copied to a0 with v_accvgpr_write_b32 inside the
; flat_atomic_cmpswap retry loop.
define void @flat_atomic_xor_expansion_i32_ret_av_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_av_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_av_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1]
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB26_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): %gep.0 is never used; the atomicrmw addresses %ptr, which is
; why the expected flat_load_dword / flat_atomic_cmpswap carry no offset.
; Confirm whether the atomic was meant to use %gep.0; if so the assertions
; need to be regenerated.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw xor ptr %ptr, i32 %data seq_cst
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Input is AGPR, result used as AV.
; atomicrmw xor (seq_cst) through a flat pointer, with the data operand defined
; under the "a" constraint and the returned value consumed under the "^VA"
; constraint. The expected code copies a0 to a VGPR with v_accvgpr_read_b32
; before the flat_atomic_cmpswap retry loop; the AV result is left in a VGPR
; (use v2), so no v_accvgpr_write_b32 is emitted.
define void @flat_atomic_xor_expansion_i32_ret_a_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_a_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_a_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v2, v[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v3, a0
; GFX950-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB27_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): %gep.0 is never used; the atomicrmw addresses %ptr, which is
; why the expected flat_load_dword / flat_atomic_cmpswap carry no offset.
; Confirm whether the atomic was meant to use %gep.0; if so the assertions
; need to be regenerated.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw xor ptr %ptr, i32 %data seq_cst
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Input is VGPR, result used as AV.
; atomicrmw xor (seq_cst) through a flat pointer, with the data operand defined
; under the "v" constraint and the returned value consumed under the "^VA"
; constraint. In the expected code the AV result is left in a VGPR (use v2),
; so the flat_atomic_cmpswap retry loop needs no accvgpr copies.
define void @flat_atomic_xor_expansion_i32_ret_v_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_v_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_v_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v2, v[0:1]
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB28_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): %gep.0 is never used; the atomicrmw addresses %ptr, which is
; why the expected flat_load_dword / flat_atomic_cmpswap carry no offset.
; Confirm whether the atomic was meant to use %gep.0; if so the assertions
; need to be regenerated.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=v"()
%result = atomicrmw xor ptr %ptr, i32 %data seq_cst
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_av_av_no_agprs:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: v_accvgpr_write_b32 a33, v1
; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:31]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4
; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5
; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6
; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7
; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8
; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9
; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10
; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11
; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12
; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13
; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14
; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15
; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16
; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17
; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18
; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19
; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20
; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21
; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22
; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23
; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24
; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25
; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26
; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27
; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28
; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29
; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30
; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a32
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a33
; GFX90A-NEXT: flat_load_dword v1, v[4:5]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v0
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v1, v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB29_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a32, v1
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7
; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8
; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9
; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10
; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11
; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12
; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13
; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14
; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15
; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16
; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17
; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18
; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19
; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20
; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21
; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22
; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23
; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24
; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25
; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26
; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27
; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28
; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29
; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30
; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:31]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a32
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_av_av_no_agprs:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:68 ; 4-byte Folded Spill
; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:64 ; 4-byte Folded Spill
; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:60 ; 4-byte Folded Spill
; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:56 ; 4-byte Folded Spill
; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:52 ; 4-byte Folded Spill
; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:48 ; 4-byte Folded Spill
; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:44 ; 4-byte Folded Spill
; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:40 ; 4-byte Folded Spill
; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:36 ; 4-byte Folded Spill
; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:32 ; 4-byte Folded Spill
; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:28 ; 4-byte Folded Spill
; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:24 ; 4-byte Folded Spill
; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:20 ; 4-byte Folded Spill
; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:16 ; 4-byte Folded Spill
; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:12 ; 4-byte Folded Spill
; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:8 ; 4-byte Folded Spill
; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:4 ; 4-byte Folded Spill
; GFX950-NEXT: scratch_store_dword off, a33, s32 ; 4-byte Folded Spill
; GFX950-NEXT: v_accvgpr_write_b32 a33, v1
; GFX950-NEXT: v_accvgpr_write_b32 a32, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:31]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_accvgpr_write_b32 a2, v2
; GFX950-NEXT: v_accvgpr_write_b32 a3, v3
; GFX950-NEXT: v_accvgpr_write_b32 a4, v4
; GFX950-NEXT: v_accvgpr_write_b32 a5, v5
; GFX950-NEXT: v_accvgpr_write_b32 a6, v6
; GFX950-NEXT: v_accvgpr_write_b32 a7, v7
; GFX950-NEXT: v_accvgpr_write_b32 a8, v8
; GFX950-NEXT: v_accvgpr_write_b32 a9, v9
; GFX950-NEXT: v_accvgpr_write_b32 a10, v10
; GFX950-NEXT: v_accvgpr_write_b32 a11, v11
; GFX950-NEXT: v_accvgpr_write_b32 a12, v12
; GFX950-NEXT: v_accvgpr_write_b32 a13, v13
; GFX950-NEXT: v_accvgpr_write_b32 a14, v14
; GFX950-NEXT: v_accvgpr_write_b32 a15, v15
; GFX950-NEXT: v_accvgpr_write_b32 a16, v16
; GFX950-NEXT: v_accvgpr_write_b32 a17, v17
; GFX950-NEXT: v_accvgpr_write_b32 a18, v18
; GFX950-NEXT: v_accvgpr_write_b32 a19, v19
; GFX950-NEXT: v_accvgpr_write_b32 a20, v20
; GFX950-NEXT: v_accvgpr_write_b32 a21, v21
; GFX950-NEXT: v_accvgpr_write_b32 a22, v22
; GFX950-NEXT: v_accvgpr_write_b32 a23, v23
; GFX950-NEXT: v_accvgpr_write_b32 a24, v24
; GFX950-NEXT: v_accvgpr_write_b32 a25, v25
; GFX950-NEXT: v_accvgpr_write_b32 a26, v26
; GFX950-NEXT: v_accvgpr_write_b32 a27, v27
; GFX950-NEXT: v_accvgpr_write_b32 a28, v28
; GFX950-NEXT: v_accvgpr_write_b32 a29, v29
; GFX950-NEXT: v_accvgpr_write_b32 a30, v30
; GFX950-NEXT: v_accvgpr_write_b32 a31, v31
; GFX950-NEXT: v_accvgpr_read_b32 v4, a32
; GFX950-NEXT: v_accvgpr_read_b32 v5, a33
; GFX950-NEXT: flat_load_dword v1, v[4:5]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v3, v1
; GFX950-NEXT: v_xor_b32_e32 v2, v3, v0
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap v1, v[4:5], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB29_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a32, v1
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
; GFX950-NEXT: v_accvgpr_read_b32 v3, a3
; GFX950-NEXT: v_accvgpr_read_b32 v4, a4
; GFX950-NEXT: v_accvgpr_read_b32 v5, a5
; GFX950-NEXT: v_accvgpr_read_b32 v6, a6
; GFX950-NEXT: v_accvgpr_read_b32 v7, a7
; GFX950-NEXT: v_accvgpr_read_b32 v8, a8
; GFX950-NEXT: v_accvgpr_read_b32 v9, a9
; GFX950-NEXT: v_accvgpr_read_b32 v10, a10
; GFX950-NEXT: v_accvgpr_read_b32 v11, a11
; GFX950-NEXT: v_accvgpr_read_b32 v12, a12
; GFX950-NEXT: v_accvgpr_read_b32 v13, a13
; GFX950-NEXT: v_accvgpr_read_b32 v14, a14
; GFX950-NEXT: v_accvgpr_read_b32 v15, a15
; GFX950-NEXT: v_accvgpr_read_b32 v16, a16
; GFX950-NEXT: v_accvgpr_read_b32 v17, a17
; GFX950-NEXT: v_accvgpr_read_b32 v18, a18
; GFX950-NEXT: v_accvgpr_read_b32 v19, a19
; GFX950-NEXT: v_accvgpr_read_b32 v20, a20
; GFX950-NEXT: v_accvgpr_read_b32 v21, a21
; GFX950-NEXT: v_accvgpr_read_b32 v22, a22
; GFX950-NEXT: v_accvgpr_read_b32 v23, a23
; GFX950-NEXT: v_accvgpr_read_b32 v24, a24
; GFX950-NEXT: v_accvgpr_read_b32 v25, a25
; GFX950-NEXT: v_accvgpr_read_b32 v26, a26
; GFX950-NEXT: v_accvgpr_read_b32 v27, a27
; GFX950-NEXT: v_accvgpr_read_b32 v28, a28
; GFX950-NEXT: v_accvgpr_read_b32 v29, a29
; GFX950-NEXT: v_accvgpr_read_b32 v30, a30
; GFX950-NEXT: v_accvgpr_read_b32 v31, a31
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:31]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a32
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: scratch_load_dword a33, off, s32 ; 4-byte Folded Reload
; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:4 ; 4-byte Folded Reload
; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:8 ; 4-byte Folded Reload
; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:12 ; 4-byte Folded Reload
; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:16 ; 4-byte Folded Reload
; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:20 ; 4-byte Folded Reload
; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:24 ; 4-byte Folded Reload
; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:28 ; 4-byte Folded Reload
; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:32 ; 4-byte Folded Reload
; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:36 ; 4-byte Folded Reload
; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:40 ; 4-byte Folded Reload
; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:44 ; 4-byte Folded Reload
; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:48 ; 4-byte Folded Reload
; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:52 ; 4-byte Folded Reload
; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:56 ; 4-byte Folded Reload
; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:60 ; 4-byte Folded Reload
; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:64 ; 4-byte Folded Reload
; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:68 ; 4-byte Folded Reload
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"()
%vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0
%vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1
%result = atomicrmw xor ptr %ptr, i32 %data seq_cst
call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1)
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; i32 xor, result unused, input is AGPR.
; The data operand comes from inline asm with an "=a" output constraint and the
; atomicrmw result (%unused) is never read. The expected output for both
; targets copies a0 into a VGPR with v_accvgpr_read_b32 ahead of the loop and
; then retries flat_atomic_cmpswap in the %atomicrmw.start loop until the
; returned value matches the previously observed one.
; NOTE(review): %gep.0 is computed but no instruction uses it; the atomicrmw
; addresses %ptr directly and the expected flat instructions carry no offset.
; Confirm whether the GEP was meant to be the atomic's pointer operand.
define void @flat_atomic_xor_expansion_i32_noret_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_noret_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB30_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i32_noret_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB30_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%unused = atomicrmw xor ptr %ptr, i32 %data seq_cst
ret void
}
; i32 xor, result unused, input is AV.
; The data operand comes from inline asm with an "=^VA" output constraint and
; the atomicrmw result (%unused) is never read. In the expected output for both
; targets the asm defines v4 directly, so no v_accvgpr_read_b32 copy appears
; before the flat_atomic_cmpswap retry loop (%atomicrmw.start).
; NOTE(review): %gep.0 is computed but no instruction uses it; the atomicrmw
; addresses %ptr directly and the expected flat instructions carry no offset.
; Confirm whether the GEP was meant to be the atomic's pointer operand.
define void @flat_atomic_xor_expansion_i32_noret_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_noret_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB31_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i32_noret_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1]
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB31_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%unused = atomicrmw xor ptr %ptr, i32 %data seq_cst
ret void
}
;---------------------------------------------------------------------
; xor i64 cases with cmpxchg expansion
;---------------------------------------------------------------------
; Input and result use AGPR
; The i64 data operand is defined by inline asm with "=a" and the atomicrmw
; result is consumed by inline asm with "a". The expected output compares v1
; (upper half of the flat pointer in v[0:1]) against the upper half of
; src_private_base and splits into two paths:
;   - %atomicrmw.global: a flat_atomic_cmpswap_x2 retry loop, with the returned
;     value written to a[0:1] via v_accvgpr_write_b32 inside the loop.
;   - %atomicrmw.private: a plain load / v_xor / store sequence on scratch
;     (buffer_* instructions on gfx90a, scratch_* on gfx950), with the loaded
;     value written to a[0:1].
; NOTE(review): %gep.0 is computed but no instruction uses it; the atomicrmw
; addresses %ptr directly. Confirm whether the GEP was meant to be used.
define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB32_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB32_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB32_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB32_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB32_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB32_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB32_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB32_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB32_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB32_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB32_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7
; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB32_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw xor ptr %ptr, i64 %data seq_cst
call void asm "; use $0", "a"(i64 %result)
ret void
}
; Input is AGPR, result used as VGPR.
; The i64 data operand is defined by inline asm with "=a" and the atomicrmw
; result is consumed by inline asm with "v". In the expected output a[0:1] is
; copied into v[2:3] with v_accvgpr_read_b32 before the address-space test, and
; both the %atomicrmw.global cmpswap loop and the %atomicrmw.private
; load / v_xor / store path leave the result in v[4:5], so no
; v_accvgpr_write_b32 is expected.
; NOTE(review): %gep.0 is computed but no instruction uses it; the atomicrmw
; addresses %ptr directly. Confirm whether the GEP was meant to be used.
define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_a_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB33_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB33_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3
; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB33_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB33_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB33_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB33_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_a_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB33_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB33_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3
; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB33_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB33_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB33_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3
; GFX950-NEXT: v_xor_b32_e32 v0, v4, v2
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
; GFX950-NEXT: .LBB33_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw xor ptr %ptr, i64 %data seq_cst
call void asm "; use $0", "v"(i64 %result)
ret void
}
; Input is VGPR, result used as AGPR
; The i64 data operand is defined by inline asm with "=v" and the atomicrmw
; result is consumed by inline asm with "a". In the expected output the asm
; defines v[6:7] directly (no v_accvgpr_read_b32), and both the
; %atomicrmw.global cmpswap loop and the %atomicrmw.private
; load / v_xor / store path copy the result into a[0:1] with
; v_accvgpr_write_b32.
; NOTE(review): %gep.0 is computed but no instruction uses it; the atomicrmw
; addresses %ptr directly. Confirm whether the GEP was meant to be used.
define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_v_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB34_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB34_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB34_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB34_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB34_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB34_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_v_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB34_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB34_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB34_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB34_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB34_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7
; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB34_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=v"()
%result = atomicrmw xor ptr %ptr, i64 %data seq_cst
call void asm "; use $0", "a"(i64 %result)
ret void
}
; Input is AV, result also used as AV
; Both the defining and the consuming inline asm use the "^VA" constraint. In
; the expected output the asm operands are v[2:3] (def) and v[4:5] (use), and
; no v_accvgpr_read_b32 / v_accvgpr_write_b32 copies appear in either the
; %atomicrmw.global cmpswap loop or the %atomicrmw.private
; load / v_xor / store path.
; NOTE(review): %gep.0 is computed but no instruction uses it; the atomicrmw
; addresses %ptr directly. Confirm whether the GEP was meant to be used.
define void @flat_atomic_xor_expansion_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB35_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB35_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3
; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB35_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB35_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB35_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB35_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB35_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB35_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3
; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB35_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB35_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB35_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3
; GFX950-NEXT: v_xor_b32_e32 v0, v4, v2
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
; GFX950-NEXT: .LBB35_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw xor ptr %ptr, i64 %data seq_cst
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; Input is AV, result used as VGPR
; The i64 data operand is defined by inline asm with "=^VA" and the atomicrmw
; result is consumed by inline asm with "v". In the expected output the asm
; operands are v[2:3] (def) and v[4:5] (use), and no v_accvgpr_read_b32 /
; v_accvgpr_write_b32 copies appear in either the %atomicrmw.global cmpswap
; loop or the %atomicrmw.private load / v_xor / store path.
; NOTE(review): %gep.0 is computed but no instruction uses it; the atomicrmw
; addresses %ptr directly. Confirm whether the GEP was meant to be used.
define void @flat_atomic_xor_expansion_i64_ret_av_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_av_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB36_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB36_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3
; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB36_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB36_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB36_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB36_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_av_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB36_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB36_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3
; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB36_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB36_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB36_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3
; GFX950-NEXT: v_xor_b32_e32 v0, v4, v2
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
; GFX950-NEXT: .LBB36_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw xor ptr %ptr, i64 %data seq_cst
call void asm "; use $0", "v"(i64 %result)
ret void
}
; Input is AV ("=^VA" asm def), result used as a ("a" asm use).
; The i64 seq_cst xor addresses %ptr directly and carries no syncscope or
; metadata. Per the checks below it is expanded into an %atomicrmw.global
; cmpswap loop and an %atomicrmw.private scratch load/xor/store path, and the
; loaded value is copied into a[0:1] on both paths for the asm use.
define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_av_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB37_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB37_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB37_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB37_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB37_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB37_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_av_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB37_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB37_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB37_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB37_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB37_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7
; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB37_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw xor ptr %ptr, i64 %data seq_cst
call void asm "; use $0", "a"(i64 %result)
ret void
}
; Input is a ("=a" asm def), result used as AV ("^VA" asm use).
; The i64 seq_cst xor addresses %ptr directly and carries no syncscope or
; metadata. Per the checks below it is expanded into an %atomicrmw.global
; cmpswap loop and an %atomicrmw.private scratch load/xor/store path; the
; AGPR input is first copied to v[2:3] and the result is consumed from v[4:5].
define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_a_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB38_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB38_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3
; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB38_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB38_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB38_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB38_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_a_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB38_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB38_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3
; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB38_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB38_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB38_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3
; GFX950-NEXT: v_xor_b32_e32 v0, v4, v2
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
; GFX950-NEXT: .LBB38_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw xor ptr %ptr, i64 %data seq_cst
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; Input is v ("=v" asm def), result used as AV ("^VA" asm use).
; The i64 seq_cst xor addresses %ptr directly and carries no syncscope or
; metadata. Per the checks below it is expanded into an %atomicrmw.global
; cmpswap loop and an %atomicrmw.private scratch load/xor/store path; the
; input is defined in v[2:3] and the result is consumed from v[4:5].
define void @flat_atomic_xor_expansion_i64_ret_v_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_v_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB39_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3
; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB39_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB39_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB39_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB39_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_v_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB39_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB39_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3
; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB39_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB39_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB39_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3
; GFX950-NEXT: v_xor_b32_e32 v0, v4, v2
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
; GFX950-NEXT: .LBB39_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%data = call i64 asm "; def $0", "=v"()
%result = atomicrmw xor ptr %ptr, i64 %data seq_cst
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; Input is a ("=a" asm def), result of the atomic is unused.
; The i64 seq_cst xor addresses %ptr directly and carries no syncscope or
; metadata. Per the checks below it is still expanded into an
; %atomicrmw.global cmpswap loop and an %atomicrmw.private scratch path, with
; the AGPR input copied to v[6:7] and no asm use of a result.
define void @flat_atomic_xor_expansion_i64_noret_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_noret_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB40_3
; GFX90A-NEXT: ; %bb.1: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB40_6
; GFX90A-NEXT: .LBB40_2: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
; GFX90A-NEXT: .LBB40_3: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB40_4: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB40_4
; GFX90A-NEXT: ; %bb.5: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB40_2
; GFX90A-NEXT: .LBB40_6: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v7
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v2, v2, v6
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i64_noret_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB40_3
; GFX950-NEXT: ; %bb.1: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB40_6
; GFX950-NEXT: .LBB40_2: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
; GFX950-NEXT: .LBB40_3: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB40_4: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB40_4
; GFX950-NEXT: ; %bb.5: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB40_2
; GFX950-NEXT: .LBB40_6: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v1, v7
; GFX950-NEXT: v_xor_b32_e32 v0, v0, v6
; GFX950-NEXT: scratch_store_dwordx2 v2, v[0:1], off
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%data = call i64 asm "; def $0", "=a"()
%unused = atomicrmw xor ptr %ptr, i64 %data seq_cst
ret void
}
; Input is AV ("=^VA" asm def), result of the atomic is unused.
; The i64 seq_cst xor addresses %ptr directly and carries no syncscope or
; metadata. Per the checks below it is still expanded into an
; %atomicrmw.global cmpswap loop and an %atomicrmw.private scratch path, with
; the input defined directly in v[6:7] and no asm use of a result.
define void @flat_atomic_xor_expansion_i64_noret_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_noret_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB41_3
; GFX90A-NEXT: ; %bb.1: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB41_6
; GFX90A-NEXT: .LBB41_2: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
; GFX90A-NEXT: .LBB41_3: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB41_4: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB41_4
; GFX90A-NEXT: ; %bb.5: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB41_2
; GFX90A-NEXT: .LBB41_6: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v7
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v2, v2, v6
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i64_noret_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB41_3
; GFX950-NEXT: ; %bb.1: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB41_6
; GFX950-NEXT: .LBB41_2: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
; GFX950-NEXT: .LBB41_3: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB41_4: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB41_4
; GFX950-NEXT: ; %bb.5: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB41_2
; GFX950-NEXT: .LBB41_6: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v1, v7
; GFX950-NEXT: v_xor_b32_e32 v0, v0, v6
; GFX950-NEXT: scratch_store_dwordx2 v2, v[0:1], off
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%data = call i64 asm "; def $0", "=^VA"()
%unused = atomicrmw xor ptr %ptr, i64 %data seq_cst
ret void
}
;---------------------------------------------------------------------
; xor i32 cases with instruction
;---------------------------------------------------------------------
; Input and result use AGPR ("=a" asm def, "a" asm use).
; Agent-scope seq_cst i32 xor on %ptr with !amdgpu.no.fine.grained.memory.
; Per the checks below this selects a single returning flat_atomic_xor (no
; cmpswap loop); the input is read out of a0 into a VGPR and the result is
; written back to a0 for the asm use.
define void @flat_atomic_xor_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Input is AGPR ("=a" asm def), result used as VGPR ("v" asm use).
; Agent-scope seq_cst i32 xor on %ptr with !amdgpu.no.fine.grained.memory.
; Per the checks below this selects a single returning flat_atomic_xor; the
; input is read out of a0 into a VGPR and the result is consumed from v0
; without any AGPR write.
define void @flat_atomic_xor_i32_ret_a_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_ret_a_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_ret_a_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "v"(i32 %result)
ret void
}
; Input is VGPR ("=v" asm def), result used as AGPR ("a" asm use).
; Agent-scope seq_cst i32 xor on %ptr with !amdgpu.no.fine.grained.memory.
; Per the checks below this selects a single returning flat_atomic_xor fed
; directly from v2; the result in v0 is then copied to a0 for the asm use.
define void @flat_atomic_xor_i32_ret_v_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_ret_v_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_ret_v_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%data = call i32 asm "; def $0", "=v"()
%result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Input is AV ("=^VA" asm def), result also used as AV ("^VA" asm use).
; Agent-scope seq_cst i32 xor on %ptr with !amdgpu.no.fine.grained.memory.
; Per the checks below this selects a single returning flat_atomic_xor with
; the input defined in v2 and the result consumed from v0; no AGPR copies
; appear.
define void @flat_atomic_xor_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Input is AV ("=^VA" asm def), result used as v ("v" asm use).
; Agent-scope seq_cst i32 xor on %ptr with !amdgpu.no.fine.grained.memory.
; Per the checks below this selects a single returning flat_atomic_xor with
; the input defined in v2 and the result consumed from v0; no AGPR copies
; appear.
define void @flat_atomic_xor_i32_ret_av_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_ret_av_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_ret_av_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "v"(i32 %result)
ret void
}
; Input is AV ("=^VA" asm def), result used as a ("a" asm use).
; Agent-scope seq_cst i32 xor on %ptr with !amdgpu.no.fine.grained.memory.
; Per the checks below this selects a single returning flat_atomic_xor with
; the input defined in v2; the result in v0 is then copied to a0 for the asm
; use.
define void @flat_atomic_xor_i32_ret_av_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_ret_av_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_ret_av_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Input is AGPR, result used as AV.
; The atomicrmw operates directly on %ptr, so the checked flat_atomic_xor
; carries no immediate offset; the AGPR input is copied to v2 before the
; returning atomic.
define void @flat_atomic_xor_i32_ret_a_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_ret_a_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_ret_a_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Input is VGPR, result used as AV.
; The atomicrmw operates directly on %ptr, so the checked flat_atomic_xor
; carries no immediate offset.
define void @flat_atomic_xor_i32_ret_v_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_ret_v_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_ret_v_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%data = call i32 asm "; def $0", "=v"()
%result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Input is AV, result also used as AV, while inline asm defines v[0:31] and
; v[32:63] before the atomicrmw and consumes them after it, so those values
; are live across the atomic. The atomicrmw operates directly on %ptr, so the
; checked flat_atomic_xor carries no immediate offset.
define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_ret_av_av_no_agprs:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:31]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:31]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_ret_av_av_no_agprs:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:31]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill
; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill
; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload
; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload
; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload
; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload
; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload
; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:31]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
; GFX950-NEXT: s_setpc_b64 s[30:31]
%data = call i32 asm "; def $0", "=^VA"()
%vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"()
%vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0
%vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1
%result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1)
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Input is AGPR, result unused.
; The atomicrmw operates directly on %ptr and its result is never read, so the
; checked output is the non-returning flat_atomic_xor with the AGPR as the
; data operand and no immediate offset.
define void @flat_atomic_xor_i32_noret_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_noret_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_xor v[0:1], a0
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_noret_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor v[0:1], a0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: s_setpc_b64 s[30:31]
%data = call i32 asm "; def $0", "=a"()
%unused = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
; Input is AV, result unused.
; The atomicrmw operates directly on %ptr and its result is never read, so the
; checked output is the non-returning flat_atomic_xor with no immediate offset.
define void @flat_atomic_xor_i32_noret_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_noret_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_xor v[0:1], v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_noret_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor v[0:1], v2
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: s_setpc_b64 s[30:31]
%data = call i32 asm "; def $0", "=^VA"()
%unused = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
;---------------------------------------------------------------------
; xor i64 cases with instruction
;---------------------------------------------------------------------
; Input and result use AGPR.
; The atomicrmw operates directly on %ptr. The checked output compares the
; high half of the pointer against src_private_base and splits into a
; flat_atomic_xor_x2 path (%atomicrmw.global) and a scratch load/xor/store
; path (%atomicrmw.private); both paths write the old value into a[0:1].
define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB53_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB53_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB53_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB53_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB53_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB53_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB53_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3
; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB53_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; Input is AGPR, result used as VGPR.
; The atomicrmw operates directly on %ptr. The checked output compares the
; high half of the pointer against src_private_base and splits into a
; flat_atomic_xor_x2 path (%atomicrmw.global) and a scratch load/xor/store
; path (%atomicrmw.private); both paths leave the old value in v[2:3].
define void @flat_atomic_xor_i64_ret_a_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i64_ret_a_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB54_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB54_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB54_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB54_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i64_ret_a_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB54_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB54_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB54_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5
; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
; GFX950-NEXT: .LBB54_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "v"(i64 %result)
ret void
}
; Input is VGPR, result used as AGPR.
; The atomicrmw operates directly on %ptr. The checked output compares the
; high half of the pointer against src_private_base and splits into a
; flat_atomic_xor_x2 path (%atomicrmw.global) and a scratch load/xor/store
; path (%atomicrmw.private); both paths write the old value into a[0:1].
define void @flat_atomic_xor_i64_ret_v_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i64_ret_v_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB55_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB55_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB55_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB55_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i64_ret_v_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB55_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB55_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB55_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3
; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB55_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%data = call i64 asm "; def $0", "=v"()
%result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; Input is AV, result also used as AV
; 64-bit agent-scope seq_cst xor whose data operand comes from an inline asm
; def with the "=^VA" constraint and whose result feeds an inline asm use with
; the "^VA" constraint. The assertions show both values staying in VGPRs
; (def v[4:5], use v[2:3]) with no AGPR copies. The operation is split on a
; compare of v1 against src_private_base: %atomicrmw.global issues the flat
; atomic, %atomicrmw.private does a load / xor / store on scratch instead.
define void @flat_atomic_xor_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB56_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB56_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB56_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB56_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB56_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB56_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB56_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5
; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
; GFX950-NEXT: .LBB56_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): %gep.0 is not referenced by any later instruction; the
; atomicrmw addresses %ptr directly, which is why the asserted
; flat_atomic_xor_x2 carries no offset. Presumably a leftover from the i32
; tests that do use %gep.0 - confirm intent before switching the operand,
; since that would require regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; Input is AV, used as v
; 64-bit agent-scope seq_cst xor: the data operand is an inline asm def with
; the "=^VA" constraint and the result is consumed by inline asm with the "v"
; constraint. The assertions show the same shape as the av_av variant: data
; in v[4:5], result in v[2:3], with a flat path (%atomicrmw.global) and a
; scratch load / xor / store path (%atomicrmw.private).
define void @flat_atomic_xor_i64_ret_av_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i64_ret_av_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB57_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB57_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB57_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB57_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i64_ret_av_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB57_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB57_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB57_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5
; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
; GFX950-NEXT: .LBB57_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): %gep.0 is not referenced by any later instruction; the
; atomicrmw addresses %ptr directly, which is why the asserted
; flat_atomic_xor_x2 carries no offset. Presumably a leftover from the i32
; tests that do use %gep.0 - confirm intent before switching the operand,
; since that would require regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "v"(i64 %result)
ret void
}
; Input is AV, used as a
; 64-bit agent-scope seq_cst xor: the data operand is an inline asm def with
; the "=^VA" constraint and the result is consumed by inline asm with the "a"
; constraint. The assertions show the data defined in v[2:3] and the result
; copied into a[0:1] with v_accvgpr_write_b32 on both the flat path
; (%atomicrmw.global) and the scratch path (%atomicrmw.private).
define void @flat_atomic_xor_i64_ret_av_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i64_ret_av_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB58_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB58_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB58_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB58_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i64_ret_av_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB58_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB58_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB58_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3
; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB58_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): %gep.0 is not referenced by any later instruction; the
; atomicrmw addresses %ptr directly, which is why the asserted
; flat_atomic_xor_x2 carries no offset. Presumably a leftover from the i32
; tests that do use %gep.0 - confirm intent before switching the operand,
; since that would require regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; Input is a, result used as AV
; 64-bit agent-scope seq_cst xor: the data operand is an inline asm def with
; the "=a" constraint and the result is consumed by inline asm with the "^VA"
; constraint. The assertions show the a[0:1] input copied to v[4:5] with
; v_accvgpr_read_b32 before the branch, and the result used from v[2:3].
define void @flat_atomic_xor_i64_ret_a_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i64_ret_a_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB59_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB59_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB59_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB59_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i64_ret_a_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB59_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB59_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB59_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5
; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
; GFX950-NEXT: .LBB59_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): %gep.0 is not referenced by any later instruction; the
; atomicrmw addresses %ptr directly, which is why the asserted
; flat_atomic_xor_x2 carries no offset. Presumably a leftover from the i32
; tests that do use %gep.0 - confirm intent before switching the operand,
; since that would require regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; Input is v, result used as AV
; 64-bit agent-scope seq_cst xor: the data operand is an inline asm def with
; the "=v" constraint and the result is consumed by inline asm with the "^VA"
; constraint. The assertions show data in v[4:5] and the result in v[2:3],
; with no AGPR copies on either the flat or the scratch path.
define void @flat_atomic_xor_i64_ret_v_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i64_ret_v_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB60_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB60_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB60_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB60_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i64_ret_v_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB60_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB60_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB60_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5
; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
; GFX950-NEXT: .LBB60_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): %gep.0 is not referenced by any later instruction; the
; atomicrmw addresses %ptr directly, which is why the asserted
; flat_atomic_xor_x2 carries no offset. Presumably a leftover from the i32
; tests that do use %gep.0 - confirm intent before switching the operand,
; since that would require regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=v"()
%result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; Input is a, result unused
; 64-bit agent-scope seq_cst xor whose result is never read; the data operand
; is an inline asm def with the "=a" constraint. The assertions show the
; non-returning flat atomic in %atomicrmw.global taking a[0:1] directly as its
; data operand, while the v[2:3] copies made with v_accvgpr_read_b32 feed the
; xor on the scratch path in %atomicrmw.private.
define void @flat_atomic_xor_i64_noret_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i64_noret_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB61_3
; GFX90A-NEXT: ; %bb.1: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB61_4
; GFX90A-NEXT: .LBB61_2: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
; GFX90A-NEXT: .LBB61_3: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], a[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB61_2
; GFX90A-NEXT: .LBB61_4: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i64_noret_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB61_3
; GFX950-NEXT: ; %bb.1: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB61_4
; GFX950-NEXT: .LBB61_2: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
; GFX950-NEXT: .LBB61_3: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], a[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB61_2
; GFX950-NEXT: .LBB61_4: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX950-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): %gep.0 is not referenced by any later instruction; the
; atomicrmw addresses %ptr directly, which is why the asserted
; flat_atomic_xor_x2 carries no offset. Presumably a leftover from the i32
; tests that do use %gep.0 - confirm intent before switching the operand,
; since that would require regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%unused = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
; Input is AV, result unused
; 64-bit agent-scope seq_cst xor whose result is never read; the data operand
; is an inline asm def with the "=^VA" constraint. The assertions show the
; data defined in v[2:3] and used both by the non-returning flat atomic in
; %atomicrmw.global and by the scratch load / xor / store in
; %atomicrmw.private, with no AGPR involved.
define void @flat_atomic_xor_i64_noret_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i64_noret_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB62_3
; GFX90A-NEXT: ; %bb.1: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB62_4
; GFX90A-NEXT: .LBB62_2: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
; GFX90A-NEXT: .LBB62_3: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB62_2
; GFX90A-NEXT: .LBB62_4: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i64_noret_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB62_3
; GFX950-NEXT: ; %bb.1: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB62_4
; GFX950-NEXT: .LBB62_2: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
; GFX950-NEXT: .LBB62_3: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB62_2
; GFX950-NEXT: .LBB62_4: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX950-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): %gep.0 is not referenced by any later instruction; the
; atomicrmw addresses %ptr directly, which is why the asserted
; flat_atomic_xor_x2 carries no offset. Presumably a leftover from the i32
; tests that do use %gep.0 - confirm intent before switching the operand,
; since that would require regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%unused = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
;---------------------------------------------------------------------
; other atomics i32, with aa+av cases
;---------------------------------------------------------------------
; Input is a, result used as a
; 32-bit workgroup-scope seq_cst add on element 10 of a [512 x i32] view of
; %ptr (the asserted offset:40). Both the data operand and the result use the
; "a" constraint; the assertions show a0 copied to v2 before the returning
; flat_atomic_add and the result copied from v0 back into a0 afterwards.
define void @flat_atomic_add_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_add_i32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_add_i32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Index 10 of i32 elements is the 40-byte offset seen on the flat atomic.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw add ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Input is AV, result used as AV
; 32-bit workgroup-scope seq_cst add on element 10 of a [512 x i32] view of
; %ptr (the asserted offset:40). Both the data operand and the result use the
; "^VA" constraint; the assertions show the data defined in v2 and the result
; used from v0, with no AGPR copies.
define void @flat_atomic_add_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_add_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_add_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Index 10 of i32 elements is the 40-byte offset seen on the flat atomic.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw add ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Input is a, result used as a
; 32-bit workgroup-scope seq_cst sub on element 10 of a [512 x i32] view of
; %ptr (the asserted offset:40). Both the data operand and the result use the
; "a" constraint; the assertions show a0 copied to v2 before the returning
; flat_atomic_sub and the result copied from v0 back into a0 afterwards.
define void @flat_atomic_sub_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_sub_i32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_sub_i32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Index 10 of i32 elements is the 40-byte offset seen on the flat atomic.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw sub ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Input is AV, result used as AV
; 32-bit workgroup-scope seq_cst sub on element 10 of a [512 x i32] view of
; %ptr (the asserted offset:40). Both the data operand and the result use the
; "^VA" constraint; the assertions show the data defined in v2 and the result
; used from v0, with no AGPR copies.
define void @flat_atomic_sub_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_sub_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_sub_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Index 10 of i32 elements is the 40-byte offset seen on the flat atomic.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw sub ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Input is a, result used as a
; 32-bit workgroup-scope seq_cst and on element 10 of a [512 x i32] view of
; %ptr (the asserted offset:40). Both the data operand and the result use the
; "a" constraint; the assertions show a0 copied to v2 before the returning
; flat_atomic_and and the result copied from v0 back into a0 afterwards.
define void @flat_atomic_and_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_and_i32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_and_i32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Index 10 of i32 elements is the 40-byte offset seen on the flat atomic.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw and ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Input is AV, result used as AV
; 32-bit workgroup-scope seq_cst and on element 10 of a [512 x i32] view of
; %ptr (the asserted offset:40). Both the data operand and the result use the
; "^VA" constraint; the assertions show the data defined in v2 and the result
; used from v0, with no AGPR copies.
define void @flat_atomic_and_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_and_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_and_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Index 10 of i32 elements is the 40-byte offset seen on the flat atomic.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw and ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; atomicrmw nand (i32, workgroup scope, seq_cst) with input and result in AGPRs.
; The checks contain no single flat nand instruction: the operation appears as
; an initial flat_load_dword followed by a flat_atomic_cmpswap retry loop
; (%atomicrmw.start). gfx90a forms the new value with v_and + v_not, gfx950
; with one v_bitop3_b32. The AGPR operand is read into v4 once before the loop
; and the cmpswap result is written back to a0 inside the loop.
define void @flat_atomic_nand_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_nand_i32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_and_b32_e32 v2, v3, v4
; GFX90A-NEXT: v_not_b32_e32 v2, v2
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB69_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_nand_i32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_bitop3_b32 v2, v3, v4, v3 bitop3:0x3f
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB69_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw nand ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; atomicrmw nand (i32, workgroup scope, seq_cst) with "^VA" constraints on the
; input and the result. As in the AGPR variant, the checks show a
; flat_load_dword plus a flat_atomic_cmpswap retry loop (%atomicrmw.start)
; rather than a single atomic instruction. Both asm operands are placed in
; VGPRs here, so the loop contains no v_accvgpr copies.
define void @flat_atomic_nand_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_nand_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_and_b32_e32 v3, v5, v2
; GFX90A-NEXT: v_not_b32_e32 v4, v3
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB70_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_nand_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_bitop3_b32 v4, v5, v3, v5 bitop3:0x3f
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB70_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw nand ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; atomicrmw or (i32, workgroup scope, seq_cst) on byte offset 40 of %ptr.
; Input and result use AGPR: the operand is copied out of a0 with
; v_accvgpr_read_b32 before the flat_atomic_or, and the returned value is
; copied back into a0 with v_accvgpr_write_b32 afterwards. The gfx950 checks
; additionally expect an s_nop 0 between the asm def and the accvgpr read.
define void @flat_atomic_or_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_or_i32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_or_i32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw or ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; atomicrmw or (i32, workgroup scope, seq_cst) on byte offset 40 of %ptr.
; Input and result use the "^VA" constraint; the checks show VGPRs chosen
; (def v2 / use v0), so the flat_atomic_or consumes and produces the asm
; operands directly with no v_accvgpr copies.
define void @flat_atomic_or_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_or_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_or_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw or ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; atomicrmw max (signed, i32, workgroup scope, seq_cst) on byte offset 40 of
; %ptr; the checks expect flat_atomic_smax. Input and result use AGPR, so the
; operand is copied a0 -> v2 before the atomic and the result v0 -> a0 after
; it. The gfx950 checks also expect an s_nop 0 ahead of the accvgpr read.
define void @flat_atomic_max_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_max_i32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_max_i32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw max ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; atomicrmw max (signed, i32, workgroup scope, seq_cst) on byte offset 40 of
; %ptr; the checks expect flat_atomic_smax. Input and result use the "^VA"
; constraint and end up in VGPRs (def v2 / use v0), so no v_accvgpr copies
; appear.
define void @flat_atomic_max_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_max_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_max_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw max ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; atomicrmw min (signed, i32, workgroup scope, seq_cst) on byte offset 40 of
; %ptr; the checks expect flat_atomic_smin. Input and result use AGPR, so the
; operand is copied a0 -> v2 before the atomic and the result v0 -> a0 after
; it. The gfx950 checks also expect an s_nop 0 ahead of the accvgpr read.
define void @flat_atomic_min_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_min_i32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_min_i32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw min ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; atomicrmw min (signed, i32, workgroup scope, seq_cst) on byte offset 40 of
; %ptr; the checks expect flat_atomic_smin. Input and result use the "^VA"
; constraint and end up in VGPRs (def v2 / use v0), so no v_accvgpr copies
; appear.
define void @flat_atomic_min_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_min_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_min_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw min ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; atomicrmw umax (unsigned, i32, workgroup scope, seq_cst) on byte offset 40
; of %ptr; the checks expect flat_atomic_umax. Input and result use AGPR, so
; the operand is copied a0 -> v2 before the atomic and the result v0 -> a0
; after it. The gfx950 checks also expect an s_nop 0 ahead of the accvgpr read.
define void @flat_atomic_umax_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umax_i32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umax_i32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw umax ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; atomicrmw umax (unsigned, i32, workgroup scope, seq_cst) on byte offset 40
; of %ptr; the checks expect flat_atomic_umax. Input and result use the "^VA"
; constraint and end up in VGPRs (def v2 / use v0), so no v_accvgpr copies
; appear.
define void @flat_atomic_umax_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umax_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umax_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw umax ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; atomicrmw umin (unsigned, i32, workgroup scope, seq_cst) on byte offset 40
; of %ptr; the checks expect flat_atomic_umin. Input and result use AGPR, so
; the operand is copied a0 -> v2 before the atomic and the result v0 -> a0
; after it. The gfx950 checks also expect an s_nop 0 ahead of the accvgpr read.
define void @flat_atomic_umin_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umin_i32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umin_i32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw umin ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; atomicrmw umin (unsigned, i32, workgroup scope, seq_cst) on byte offset 40
; of %ptr; the checks expect flat_atomic_umin. Input and result use the "^VA"
; constraint and end up in VGPRs (def v2 / use v0), so no v_accvgpr copies
; appear.
define void @flat_atomic_umin_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umin_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umin_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw umin ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; atomicrmw uinc_wrap (i32, workgroup scope, seq_cst) on byte offset 40 of
; %ptr; the checks expect a single flat_atomic_inc. Input and result use AGPR,
; so the operand is copied a0 -> v2 before the atomic and the result v0 -> a0
; after it. The gfx950 checks also expect an s_nop 0 ahead of the accvgpr read.
define void @flat_atomic_uinc_wrap_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_uinc_wrap_i32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_uinc_wrap_i32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw uinc_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; atomicrmw uinc_wrap (i32, workgroup scope, seq_cst) on byte offset 40 of
; %ptr; the checks expect a single flat_atomic_inc. Input and result use the
; "^VA" constraint and end up in VGPRs (def v2 / use v0), so no v_accvgpr
; copies appear.
define void @flat_atomic_uinc_wrap_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_uinc_wrap_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_uinc_wrap_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw uinc_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; atomicrmw udec_wrap (i32, workgroup scope, seq_cst) on byte offset 40 of
; %ptr; the checks expect a single flat_atomic_dec. Input and result use AGPR,
; so the operand is copied a0 -> v2 before the atomic and the result v0 -> a0
; after it. The gfx950 checks also expect an s_nop 0 ahead of the accvgpr read.
define void @flat_atomic_udec_wrap_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_udec_wrap_i32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_udec_wrap_i32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw udec_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; atomicrmw udec_wrap (i32, workgroup scope, seq_cst) on byte offset 40 of
; %ptr; the checks expect a single flat_atomic_dec. Input and result use the
; "^VA" constraint and end up in VGPRs (def v2 / use v0), so no v_accvgpr
; copies appear.
define void @flat_atomic_udec_wrap_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_udec_wrap_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_udec_wrap_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw udec_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; atomicrmw usub_cond (i32, workgroup scope, seq_cst) with input and result in
; AGPRs. The checks show a flat_load_dword plus a flat_atomic_cmpswap retry
; loop (%atomicrmw.start) instead of a single atomic instruction; the new
; value is selected with v_sub / v_cmp_ge_u32 / v_cndmask. The AGPR operand is
; read into v4 once before the loop and the cmpswap result is written back to
; a0 inside the loop. The gfx950 checks also expect an s_nop 1 between the
; compare and the v_cndmask.
define void @flat_atomic_usub_cond_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_cond_i32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_sub_u32_e32 v2, v3, v4
; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB85_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_usub_cond_i32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_sub_u32_e32 v2, v3, v4
; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB85_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw usub_cond ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; atomicrmw usub_cond (i32, workgroup scope, seq_cst) with "^VA" constraints
; on the input and the result. The checks show the same flat_load_dword plus
; flat_atomic_cmpswap retry loop as the AGPR variant, with the new value
; selected by v_sub / v_cmp_ge_u32 / v_cndmask. Both asm operands are placed
; in VGPRs here (def v2 / use v3), so the loop contains no v_accvgpr copies.
define void @flat_atomic_usub_cond_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_cond_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_sub_u32_e32 v3, v5, v2
; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2
; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB86_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_usub_cond_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v3
; GFX950-NEXT: v_sub_u32_e32 v3, v5, v2
; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
; GFX950-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB86_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw usub_cond ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; atomicrmw usub_sat (i32, workgroup scope, seq_cst) with input and result in
; AGPRs. The checks show a flat_load_dword plus a flat_atomic_cmpswap retry
; loop (%atomicrmw.start) instead of a single atomic instruction; the new
; value is computed with one v_sub_u32 carrying the clamp modifier. The AGPR
; operand is read into v4 once before the loop and the cmpswap result is
; written back to a0 inside the loop.
define void @flat_atomic_usub_sat_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_sat_i32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_sub_u32_e64 v2, v3, v4 clamp
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB87_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_usub_sat_i32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_sub_u32_e64 v2, v3, v4 clamp
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB87_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw usub_sat ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; atomicrmw usub_sat (i32, workgroup scope, seq_cst) with "^VA" constraints on
; the input and the result. The checks show the same flat_load_dword plus
; flat_atomic_cmpswap retry loop as the AGPR variant, with the new value
; computed by one v_sub_u32 carrying the clamp modifier. Both asm operands are
; placed in VGPRs here (def v3 / use v2), so the loop contains no v_accvgpr
; copies.
define void @flat_atomic_usub_sat_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_sat_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_sub_u32_e64 v4, v5, v3 clamp
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB88_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_usub_sat_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_sub_u32_e64 v4, v5, v3 clamp
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB88_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw usub_sat ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
;---------------------------------------------------------------------
; other atomics i64, with aa+av cases
;---------------------------------------------------------------------
; i64 'atomicrmw add' through a flat pointer; input and result use AGPR
; ("a" inline-asm constraints on both the def and the use). The expected code
; copies a[0:1] into VGPRs before the atomic and writes the result back to
; a[0:1] afterwards. The address high dword is compared against
; src_private_base to pick either the %atomicrmw.global path
; (flat_atomic_add_x2) or the %atomicrmw.private path (scratch load, 64-bit
; add, scratch store).
define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_add_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB89_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB89_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB89_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v1, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB89_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_add_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB89_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB89_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB89_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB89_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64] is 80 bytes past %ptr (the 0x50 in the checks).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw add ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; i64 'atomicrmw add' through a flat pointer; input and result use the "^VA"
; inline-asm constraint (presumably "either VGPR or AGPR", matching the 'av'
; in the name -- confirm against the AMDGPU constraint documentation). In the
; expected code the data and the result both live in VGPR pairs and no
; v_accvgpr copies appear. The address high dword is compared against
; src_private_base to pick either the %atomicrmw.global path
; (flat_atomic_add_x2) or the %atomicrmw.private path (scratch load, 64-bit
; add, scratch store).
define void @flat_atomic_add_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_add_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB90_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB90_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB90_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v0, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc
; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB90_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_add_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB90_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB90_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB90_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[4:5]
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB90_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64] is 80 bytes past %ptr (the 0x50 in the checks).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw add ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; i64 'atomicrmw sub' through a flat pointer; input and result use AGPR
; ("a" inline-asm constraints on both the def and the use). The expected code
; copies a[0:1] into VGPRs before the atomic and writes the result back to
; a[0:1] afterwards. The address high dword is compared against
; src_private_base to pick either the %atomicrmw.global path
; (flat_atomic_sub_x2) or the %atomicrmw.private path (scratch load,
; v_sub_co/v_subb_co pair, scratch store).
define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_sub_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB91_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB91_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB91_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v1, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB91_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_sub_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB91_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB91_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB91_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB91_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64] is 80 bytes past %ptr (the 0x50 in the checks).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw sub ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; i64 'atomicrmw sub' through a flat pointer; input and result use the "^VA"
; inline-asm constraint (presumably "either VGPR or AGPR", matching the 'av'
; in the name -- confirm against the AMDGPU constraint documentation). In the
; expected code the data and the result both live in VGPR pairs and no
; v_accvgpr copies appear. The address high dword is compared against
; src_private_base to pick either the %atomicrmw.global path
; (flat_atomic_sub_x2) or the %atomicrmw.private path (scratch load,
; v_sub_co/v_subb_co pair, scratch store).
define void @flat_atomic_sub_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_sub_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB92_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB92_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB92_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc
; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB92_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_sub_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB92_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB92_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB92_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB92_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64] is 80 bytes past %ptr (the 0x50 in the checks).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw sub ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; i64 'atomicrmw and' through a flat pointer; input and result use AGPR
; ("a" inline-asm constraints on both the def and the use). The expected code
; copies a[0:1] into VGPRs before the atomic and writes the result back to
; a[0:1] afterwards. The address high dword is compared against
; src_private_base to pick either the %atomicrmw.global path
; (flat_atomic_and_x2) or the %atomicrmw.private path (scratch load, one
; v_and_b32 per 32-bit half, scratch store).
define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_and_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB93_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB93_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB93_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_and_b32_e32 v2, v4, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
; GFX90A-NEXT: v_and_b32_e32 v3, v1, v3
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB93_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_and_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB93_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB93_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB93_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_and_b32_e32 v3, v1, v3
; GFX950-NEXT: v_and_b32_e32 v2, v0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB93_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64] is 80 bytes past %ptr (the 0x50 in the checks).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw and ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; i64 'atomicrmw and' through a flat pointer; input and result use the "^VA"
; inline-asm constraint (presumably "either VGPR or AGPR", matching the 'av'
; in the name -- confirm against the AMDGPU constraint documentation). In the
; expected code the data and the result both live in VGPR pairs and no
; v_accvgpr copies appear. The address high dword is compared against
; src_private_base to pick either the %atomicrmw.global path
; (flat_atomic_and_x2) or the %atomicrmw.private path (scratch load, one
; v_and_b32 per 32-bit half, scratch store).
define void @flat_atomic_and_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_and_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB94_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB94_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB94_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_and_b32_e32 v3, v1, v3
; GFX90A-NEXT: v_and_b32_e32 v2, v0, v2
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB94_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_and_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB94_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB94_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB94_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v3, v1, v3
; GFX950-NEXT: v_and_b32_e32 v2, v0, v2
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB94_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64] is 80 bytes past %ptr (the 0x50 in the checks).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw and ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; i64 'atomicrmw nand' through a flat pointer; input and result use AGPR
; ("a" inline-asm constraints on both the def and the use). Unlike the
; add/sub/and cases, the expected %atomicrmw.global path contains no single
; atomic instruction for the operation: it loads the current value and runs a
; compare-and-swap loop (%atomicrmw.start, flat_atomic_cmpswap_x2) whose new
; value is not(and(loaded, data)) computed per 32-bit half. The
; %atomicrmw.private path does a scratch load, and + not per half, and a
; scratch store. The path is selected by comparing the address high dword
; against src_private_base.
define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_nand_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB95_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB95_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_and_b32_e32 v0, v3, v7
; GFX90A-NEXT: v_and_b32_e32 v8, v2, v6
; GFX90A-NEXT: v_not_b32_e32 v1, v0
; GFX90A-NEXT: v_not_b32_e32 v0, v8
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB95_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB95_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB95_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_and_b32_e32 v3, v1, v7
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_and_b32_e32 v4, v2, v6
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_not_b32_e32 v2, v3
; GFX90A-NEXT: v_not_b32_e32 v3, v4
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB95_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_nand_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB95_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB95_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v0, v3, v7
; GFX950-NEXT: v_and_b32_e32 v8, v2, v6
; GFX950-NEXT: v_not_b32_e32 v1, v0
; GFX950-NEXT: v_not_b32_e32 v0, v8
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB95_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB95_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB95_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, v1, v7
; GFX950-NEXT: v_and_b32_e32 v5, v0, v6
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_not_b32_e32 v3, v2
; GFX950-NEXT: v_not_b32_e32 v2, v5
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB95_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64] is 80 bytes past %ptr (the 0x50 in the checks).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw nand ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
define void @flat_atomic_nand_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_nand_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB96_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[2:3]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB96_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: v_and_b32_e32 v4, v7, v1
; GFX90A-NEXT: v_and_b32_e32 v8, v6, v0
; GFX90A-NEXT: v_not_b32_e32 v5, v4
; GFX90A-NEXT: v_not_b32_e32 v4, v8
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[2:3], v[4:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB96_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB96_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB96_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_and_b32_e32 v0, v4, v0
; GFX90A-NEXT: v_not_b32_e32 v0, v0
; GFX90A-NEXT: v_not_b32_e32 v1, v1
; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB96_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_nand_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB96_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB96_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3]
; GFX950-NEXT: v_and_b32_e32 v2, v9, v1
; GFX950-NEXT: v_and_b32_e32 v3, v8, v0
; GFX950-NEXT: v_not_b32_e32 v7, v2
; GFX950-NEXT: v_not_b32_e32 v6, v3
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB96_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB96_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB96_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v1, v3, v1
; GFX950-NEXT: v_and_b32_e32 v0, v2, v0
; GFX950-NEXT: v_not_b32_e32 v1, v1
; GFX950-NEXT: v_not_b32_e32 v0, v0
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB96_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw nand ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; 64-bit flat atomicrmw or at workgroup scope (seq_cst) whose old value is used.
; Operand and result are both constrained to AGPRs ("a"), so the expected
; output copies a[0:1] into VGPRs before the atomic and writes the returned
; value back into a[0:1] afterwards.
; The expected code compares the high address dword against src_private_base
; and runs either the %atomicrmw.global path (flat_atomic_or_x2) or the
; %atomicrmw.private path (load, v_or, store).
define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_or_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB97_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB97_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB97_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_or_b32_e32 v2, v4, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
; GFX90A-NEXT: v_or_b32_e32 v3, v1, v3
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB97_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_or_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB97_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB97_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB97_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_or_b32_e32 v3, v1, v3
; GFX950-NEXT: v_or_b32_e32 v2, v0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB97_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64] is byte offset 80, i.e. the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; Inline asm defines the atomic operand in an AGPR pair.
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw or ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Consuming the result through an "a" operand forces the old value into AGPRs.
call void asm "; use $0", "a"(i64 %result)
ret void
}
; 64-bit flat atomicrmw or at workgroup scope (seq_cst) whose old value is used.
; Operand and result use the "^VA" constraint; in the expected output both are
; placed in VGPR pairs (def v[2:3], use v[0:1]), so no v_accvgpr copies appear.
; The expected code compares the high address dword against src_private_base
; and runs either the %atomicrmw.global path (flat_atomic_or_x2) or the
; %atomicrmw.private path (load, v_or, store).
define void @flat_atomic_or_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_or_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB98_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB98_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB98_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_or_b32_e32 v3, v1, v3
; GFX90A-NEXT: v_or_b32_e32 v2, v0, v2
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB98_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_or_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB98_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB98_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB98_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_or_b32_e32 v3, v1, v3
; GFX950-NEXT: v_or_b32_e32 v2, v0, v2
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB98_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64] is byte offset 80, i.e. the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; Inline asm defines the atomic operand under the "VA" constraint
; (presumably either register file is allowed; a VGPR pair is expected here).
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw or ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; The old value is consumed under the same constraint so it cannot be dropped.
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; 64-bit flat atomicrmw max (signed) at workgroup scope (seq_cst) whose old
; value is used. Operand and result are both constrained to AGPRs ("a"), so
; the expected output copies a[0:1] into VGPRs before the atomic and writes the
; returned value back into a[0:1] afterwards.
; The expected code compares the high address dword against src_private_base
; and runs either the %atomicrmw.global path (flat_atomic_smax_x2) or the
; %atomicrmw.private path (load, v_cmp_gt_i64 + v_cndmask select, store).
define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_max_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB99_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB99_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB99_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB99_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_max_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB99_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB99_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB99_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB99_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64] is byte offset 80, i.e. the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; Inline asm defines the atomic operand in an AGPR pair.
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw max ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Consuming the result through an "a" operand forces the old value into AGPRs.
call void asm "; use $0", "a"(i64 %result)
ret void
}
; 64-bit flat atomicrmw max (signed) at workgroup scope (seq_cst) whose old
; value is used. Operand and result use the "^VA" constraint; in the expected
; output both are placed in VGPR pairs (def v[2:3], use v[0:1]), so no
; v_accvgpr copies appear.
; The expected code compares the high address dword against src_private_base
; and runs either the %atomicrmw.global path (flat_atomic_smax_x2) or the
; %atomicrmw.private path (load, v_cmp_gt_i64 + v_cndmask select, store).
define void @flat_atomic_max_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_max_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB100_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB100_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB100_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB100_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_max_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB100_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB100_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB100_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB100_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64] is byte offset 80, i.e. the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; Inline asm defines the atomic operand under the "VA" constraint
; (presumably either register file is allowed; a VGPR pair is expected here).
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw max ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; The old value is consumed under the same constraint so it cannot be dropped.
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; 64-bit flat atomicrmw min (signed) at workgroup scope (seq_cst) whose old
; value is used. Operand and result are both constrained to AGPRs ("a"), so
; the expected output copies a[0:1] into VGPRs before the atomic and writes the
; returned value back into a[0:1] afterwards.
; The expected code compares the high address dword against src_private_base
; and runs either the %atomicrmw.global path (flat_atomic_smin_x2) or the
; %atomicrmw.private path (load, v_cmp_le_i64 + v_cndmask select, store).
define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_min_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB101_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB101_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB101_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB101_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_min_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB101_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB101_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB101_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB101_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64] is byte offset 80, i.e. the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; Inline asm defines the atomic operand in an AGPR pair.
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw min ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Consuming the result through an "a" operand forces the old value into AGPRs.
call void asm "; use $0", "a"(i64 %result)
ret void
}
; 64-bit flat atomicrmw min (signed) at workgroup scope (seq_cst) whose old
; value is used. Operand and result use the "^VA" constraint; in the expected
; output both are placed in VGPR pairs (def v[2:3], use v[0:1]), so no
; v_accvgpr copies appear.
; The expected code compares the high address dword against src_private_base
; and runs either the %atomicrmw.global path (flat_atomic_smin_x2) or the
; %atomicrmw.private path (load, v_cmp_le_i64 + v_cndmask select, store).
define void @flat_atomic_min_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_min_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB102_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB102_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB102_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB102_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_min_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB102_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB102_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB102_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB102_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64] is byte offset 80, i.e. the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; Inline asm defines the atomic operand under the "VA" constraint
; (presumably either register file is allowed; a VGPR pair is expected here).
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw min ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; The old value is consumed under the same constraint so it cannot be dropped.
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; 64-bit flat atomicrmw umax (unsigned) at workgroup scope (seq_cst) whose old
; value is used. Operand and result are both constrained to AGPRs ("a"), so
; the expected output copies a[0:1] into VGPRs before the atomic and writes the
; returned value back into a[0:1] afterwards.
; The expected code compares the high address dword against src_private_base
; and runs either the %atomicrmw.global path (flat_atomic_umax_x2) or the
; %atomicrmw.private path (load, v_cmp_gt_u64 + v_cndmask select, store).
define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umax_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB103_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB103_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB103_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB103_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umax_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB103_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB103_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB103_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB103_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64] is byte offset 80, i.e. the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; Inline asm defines the atomic operand in an AGPR pair.
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw umax ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Consuming the result through an "a" operand forces the old value into AGPRs.
call void asm "; use $0", "a"(i64 %result)
ret void
}
; 64-bit flat atomicrmw umax (unsigned) at workgroup scope (seq_cst) whose old
; value is used. Operand and result use the "^VA" constraint; in the expected
; output both are placed in VGPR pairs (def v[2:3], use v[0:1]), so no
; v_accvgpr copies appear.
; The expected code compares the high address dword against src_private_base
; and runs either the %atomicrmw.global path (flat_atomic_umax_x2) or the
; %atomicrmw.private path (load, v_cmp_gt_u64 + v_cndmask select, store).
define void @flat_atomic_umax_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umax_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB104_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB104_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB104_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB104_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umax_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB104_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB104_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB104_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB104_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64] is byte offset 80, i.e. the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; Inline asm defines the atomic operand under the "VA" constraint
; (presumably either register file is allowed; a VGPR pair is expected here).
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw umax ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; The old value is consumed under the same constraint so it cannot be dropped.
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; umin i64 with the returned value used. The data operand is defined with the
; "=a" constraint and the old value is consumed with the "a" constraint. The
; expected output copies a[0:1] into VGPRs before the atomic and writes the old
; value back to a[0:1] on both the %atomicrmw.global and %atomicrmw.private
; paths.
define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umin_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB105_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB105_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB105_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB105_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umin_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB105_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB105_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB105_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB105_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64]; this is the 0x50 byte offset added to the pointer
; in the expected output above.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw umin ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; umin i64 with the returned value used. Both the data operand and the result
; use the "^VA" inline-asm constraint. The expected output keeps both in VGPRs
; (def v[2:3], use v[0:1]) and contains no v_accvgpr copies.
define void @flat_atomic_umin_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umin_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB106_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB106_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB106_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB106_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umin_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB106_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB106_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB106_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB106_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64]; this is the 0x50 byte offset added to the pointer
; in the expected output above.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw umin ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; uinc_wrap i64 with the returned value used. The data operand is defined with
; the "=a" constraint and the old value is consumed with the "a" constraint.
; In the expected output the %atomicrmw.global path is a single
; flat_atomic_inc_x2. The %atomicrmw.private path loads the old value and stores
; old + 1 when old < data (unsigned compare), otherwise 0.
define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_uinc_wrap_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB107_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB107_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB107_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc
; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB107_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_uinc_wrap_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB107_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB107_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB107_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB107_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64]; this is the 0x50 byte offset added to the pointer
; in the expected output above.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw uinc_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; uinc_wrap i64 with the returned value used. Both the data operand and the
; result use the "^VA" inline-asm constraint. The expected output keeps both in
; VGPRs (def v[2:3], use v[0:1]) and contains no v_accvgpr copies. The
; %atomicrmw.private path stores old + 1 when old < data (unsigned compare),
; otherwise 0.
define void @flat_atomic_uinc_wrap_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_uinc_wrap_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB108_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB108_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB108_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB108_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_uinc_wrap_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB108_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB108_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB108_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB108_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64]; this is the 0x50 byte offset added to the pointer
; in the expected output above.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw uinc_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; udec_wrap i64 with the returned value used. The data operand is defined with
; the "=a" constraint and the old value is consumed with the "a" constraint.
; In the expected output the %atomicrmw.global path is a single
; flat_atomic_dec_x2. The %atomicrmw.private path stores the data operand when
; old == 0 or old > data (unsigned compare), otherwise old - 1. That path needs
; a second SGPR pair for the compare result, so the saved exec mask lives in a
; different pair than in the umin/uinc_wrap tests.
define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_udec_wrap_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB109_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB109_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB109_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1]
; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB109_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_udec_wrap_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB109_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB109_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB109_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1]
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1
; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
; GFX950-NEXT: .LBB109_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64]; this is the 0x50 byte offset added to the pointer
; in the expected output above.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw udec_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; udec_wrap i64 with the returned value used. Both the data operand and the
; result use the "^VA" inline-asm constraint. The expected output keeps both in
; VGPRs (def v[2:3], use v[0:1]) and contains no v_accvgpr copies. The
; %atomicrmw.private path stores the data operand when old == 0 or old > data
; (unsigned compare), otherwise old - 1.
define void @flat_atomic_udec_wrap_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_udec_wrap_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB110_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB110_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB110_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB110_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_udec_wrap_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB110_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB110_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB110_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3]
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1
; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB110_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64]; this is the 0x50 byte offset added to the pointer
; in the expected output above.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw udec_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; usub_cond i64 with the returned value used. The data operand is defined with
; the "=a" constraint and the old value is consumed with the "a" constraint.
; Unlike the umin/uinc_wrap/udec_wrap tests, the expected %atomicrmw.global path
; is not a single flat atomic: it is a flat_load_dwordx2 followed by a
; flat_atomic_cmpswap_x2 retry loop (%atomicrmw.start). Both that loop and the
; %atomicrmw.private path compute old - data when old >= data (unsigned
; compare), otherwise they keep old.
define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_cond_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB111_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB111_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB111_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB111_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB111_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc
; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB111_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_usub_cond_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB111_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB111_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB111_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB111_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB111_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB111_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64]; this is the 0x50 byte offset added to the pointer
; in the expected output above.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw usub_cond ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; usub_cond on i64 through a flat pointer (workgroup scope, seq_cst).
; The data operand and the result both use the "^VA" inline-asm constraint
; (presumably the combined VGPR/AGPR class, per the _av_av suffix - confirm);
; in the checks below both targets keep them in VGPRs.
; Both targets check a 64-bit compare-and-swap loop for the non-private case
; and a separate %atomicrmw.private block that does a plain load, the
; conditional subtract, and a plain store.
define void @flat_atomic_usub_cond_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_cond_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB112_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[2:3]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB112_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v0
; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v1, vcc
; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[2:3], v[4:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB112_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB112_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB112_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v4, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v1, vcc
; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB112_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_usub_cond_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB112_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB112_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3]
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB112_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB112_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB112_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_sub_co_u32_e32 v5, vcc, v2, v0
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v1, vcc
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB112_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64] is byte offset 80, i.e. the 0x50 that the checked
; code adds to the incoming pointer.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw usub_cond ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; usub_sat on i64 through a flat pointer (workgroup scope, seq_cst).
; The data operand and the result both use the "a" inline-asm constraint; the
; checks show them living in a[0:1] and being copied to and from VGPRs with
; v_accvgpr_read_b32 / v_accvgpr_write_b32 around the memory operation.
; Both targets check a 64-bit compare-and-swap loop for the non-private case
; and a separate %atomicrmw.private block that does a plain load, the
; saturating subtract, and a plain store.
define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_sat_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB113_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB113_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB113_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB113_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB113_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v1, v6
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v7, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB113_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_usub_sat_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB113_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB113_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB113_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB113_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB113_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v6
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v7, vcc
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB113_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64] is byte offset 80, i.e. the 0x50 that the checked
; code adds to the incoming pointer.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw usub_sat ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; usub_sat on i64 through a flat pointer (workgroup scope, seq_cst).
; The data operand and the result both use the "^VA" inline-asm constraint
; (presumably the combined VGPR/AGPR class, per the _av_av suffix - confirm);
; in the checks below both targets keep them in VGPRs, so no
; v_accvgpr_read/write copies appear.
; Both targets check a 64-bit compare-and-swap loop for the non-private case
; and a separate %atomicrmw.private block that does a plain load, the
; saturating subtract, and a plain store.
define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_sat_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB114_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB114_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB114_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB114_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB114_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_sub_co_u32_e32 v1, vcc, v4, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v3, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB114_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_usub_sat_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB114_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB114_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3]
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB114_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB114_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB114_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB114_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x i64] is byte offset 80, i.e. the 0x50 that the checked
; code adds to the incoming pointer.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw usub_sat ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
;---------------------------------------------------------------------
; other atomics f32, with aa+av cases
;---------------------------------------------------------------------
; fadd on float through a flat pointer (workgroup scope, seq_cst), tagged with
; !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode.
; The data operand and the result both use the "a" inline-asm constraint; the
; checks show them in a0 with v_accvgpr_read_b32 / v_accvgpr_write_b32 copies.
; The two targets differ in the checked output:
;  - gfx90a branches on the address against src_shared_base and
;    src_private_base and uses ds_add_rtn_f32 (%atomicrmw.shared), a plain
;    load / v_add_f32 / store (%atomicrmw.private), or global_atomic_add_f32
;    (%atomicrmw.global).
;  - gfx950 uses a single flat_atomic_add_f32 with no address checks.
define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_f32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $agpr0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB115_6
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
; GFX90A-NEXT: ; implicit-def: $agpr0
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB115_3
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc
; GFX90A-NEXT: ; implicit-def: $vgpr2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB115_3: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB115_5
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f32_e32 v2, v1, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: .LBB115_5: ; %Flow1
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2
; GFX90A-NEXT: .LBB115_6: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB115_8
; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: .LBB115_8: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_f32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x float] is byte offset 40, which shows up in the checks
; as an explicit add of 40 (gfx90a) or as offset:40 (gfx950).
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
%data = call float asm "; def $0", "=a"()
%result = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
call void asm "; use $0", "a"(float %result)
ret void
}
; fadd on float through a flat pointer (workgroup scope, seq_cst), tagged with
; !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode.
; The data operand and the result both use the "^VA" inline-asm constraint
; (presumably the combined VGPR/AGPR class, per the _av_av suffix - confirm);
; in the checks below both targets keep them in VGPRs.
; The two targets differ in the checked output:
;  - gfx90a branches on the address against src_shared_base and
;    src_private_base and uses ds_add_rtn_f32 (%atomicrmw.shared), a plain
;    load / v_add_f32 / store (%atomicrmw.private), or global_atomic_add_f32
;    (%atomicrmw.global).
;  - gfx950 uses a single flat_atomic_add_f32 with no address checks.
define void @flat_atomic_fadd_f32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_f32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr2
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB116_6
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
; GFX90A-NEXT: ; implicit-def: $vgpr2
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB116_3
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX90A-NEXT: global_atomic_add_f32 v2, v[0:1], v3, off glc
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr3
; GFX90A-NEXT: .LBB116_3: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB116_5
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f32_e32 v1, v2, v3
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: .LBB116_5: ; %Flow1
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr3
; GFX90A-NEXT: .LBB116_6: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB116_8
; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ds_add_rtn_f32 v2, v0, v3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: .LBB116_8: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_f32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x float] is byte offset 40, which shows up in the checks
; as an explicit add of 40 (gfx90a) or as offset:40 (gfx950).
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
%data = call float asm "; def $0", "=^VA"()
%result = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
call void asm "; use $0", "^VA"(float %result)
ret void
}
; fsub on float through a flat pointer (workgroup scope, seq_cst), tagged with
; !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode.
; The data operand and the result both use the "a" inline-asm constraint; the
; checks show the operand read out of a0 once before the loop and the result
; written back to a0 inside the loop.
; Both targets check the same shape: an initial flat_load_dword followed by a
; v_sub_f32 + flat_atomic_cmpswap retry loop.
define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_f32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB117_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB117_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_f32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB117_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB117_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x float] is byte offset 40, matching offset:40 on the
; checked load and cmpswap.
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
%data = call float asm "; def $0", "=a"()
%result = atomicrmw fsub ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
call void asm "; use $0", "a"(float %result)
ret void
}
; fsub on float through a flat pointer (workgroup scope, seq_cst), tagged with
; !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode.
; The data operand and the result both use the "^VA" inline-asm constraint
; (presumably the combined VGPR/AGPR class, per the _av_av suffix - confirm);
; in the checks below both targets keep them in VGPRs (v3 in, v2 out).
; Both targets check the same shape: an initial flat_load_dword followed by a
; v_sub_f32 + flat_atomic_cmpswap retry loop.
define void @flat_atomic_fsub_f32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_f32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB118_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v3
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB118_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_f32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB118_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_sub_f32_e32 v4, v5, v3
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB118_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x float] is byte offset 40, matching offset:40 on the
; checked load and cmpswap.
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
%data = call float asm "; def $0", "=^VA"()
%result = atomicrmw fsub ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
call void asm "; use $0", "^VA"(float %result)
ret void
}
; fmax on float through a flat pointer (workgroup scope, seq_cst), tagged with
; !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode.
; The data operand and the result both use the "a" inline-asm constraint; the
; checks show the operand read out of a0 once before the loop and the result
; written back to a0 inside the loop.
; Both targets check the same shape: an initial flat_load_dword followed by a
; v_max_f32 + flat_atomic_cmpswap retry loop. The extra "v_max_f32 x, x, x"
; on the data operand (before the loop) and on the loaded value (inside it)
; is presumably input canonicalization for the max - confirm.
define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_f32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB119_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB119_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_f32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_max_f32_e32 v4, v2, v2
; GFX950-NEXT: .LBB119_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_max_f32_e32 v2, v3, v3
; GFX950-NEXT: v_max_f32_e32 v2, v2, v4
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB119_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of [512 x float] is byte offset 40, matching offset:40 on the
; checked load and cmpswap.
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
%data = call float asm "; def $0", "=a"()
%result = atomicrmw fmax ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
call void asm "; use $0", "a"(float %result)
ret void
}
; f32 fmax where the input and the result both pass through inline asm using
; the "^VA" constraint. The expected output for gfx90a and gfx950 shows plain
; VGPRs being picked for both operands, and the atomic being emitted as a
; flat_atomic_cmpswap retry loop rather than a single atomic instruction.
define void @flat_atomic_fmax_f32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_f32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3
; GFX90A-NEXT: .LBB120_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v3
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB120_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_f32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_max_f32_e32 v3, v3, v3
; GFX950-NEXT: .LBB120_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_max_f32_e32 v2, v5, v5
; GFX950-NEXT: v_max_f32_e32 v4, v2, v3
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB120_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a float array: this is the byte offset 40 that appears on the
; flat load and the cmpswap in the expected output.
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
; The operand comes from inline asm so the register choice is driven by the
; constraint string and not by an incoming argument.
%data = call float asm "; def $0", "=^VA"()
%result = atomicrmw fmax ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
; Consuming the result keeps the returning form of the atomic alive.
call void asm "; use $0", "^VA"(float %result)
ret void
}
; f32 fmin where the input and the result both use the "a" (AGPR) constraint.
; In the expected output the operand is defined in a0, copied out with
; v_accvgpr_read_b32 ahead of the cmpswap loop, and the returned value is
; copied back into a0 with v_accvgpr_write_b32 inside the loop.
define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_f32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB121_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB121_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_f32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_max_f32_e32 v4, v2, v2
; GFX950-NEXT: .LBB121_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_max_f32_e32 v2, v3, v3
; GFX950-NEXT: v_min_f32_e32 v2, v2, v4
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB121_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a float array: this is the byte offset 40 that appears on the
; flat load and the cmpswap in the expected output.
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
; "=a" places the operand in an AGPR (a0 in the expected output).
%data = call float asm "; def $0", "=a"()
%result = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
; The result is consumed from an AGPR as well.
call void asm "; use $0", "a"(float %result)
ret void
}
; f32 fmin where the input and the result both pass through inline asm using
; the "^VA" constraint. The expected output for both targets shows VGPRs being
; picked (def v3 / use v2) with no accvgpr copies, around a
; flat_atomic_cmpswap retry loop that computes the new value with v_min_f32.
define void @flat_atomic_fmin_f32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_f32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3
; GFX90A-NEXT: .LBB122_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5
; GFX90A-NEXT: v_min_f32_e32 v4, v2, v3
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB122_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_f32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_max_f32_e32 v3, v3, v3
; GFX950-NEXT: .LBB122_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_max_f32_e32 v2, v5, v5
; GFX950-NEXT: v_min_f32_e32 v4, v2, v3
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB122_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a float array: this is the byte offset 40 that appears on the
; flat load and the cmpswap in the expected output.
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
; The operand comes from inline asm so the register choice is driven by the
; constraint string and not by an incoming argument.
%data = call float asm "; def $0", "=^VA"()
%result = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
; Consuming the result keeps the returning form of the atomic alive.
call void asm "; use $0", "^VA"(float %result)
ret void
}
; f32 fmaximum where the input and the result both use the "a" (AGPR)
; constraint. The two targets compute the new value differently inside the
; cmpswap loop: the gfx90a output uses v_max_f32 plus a v_cmp_o_f32 /
; v_cndmask_b32 select against the constant 0x7fc00000 (the f32 quiet-NaN bit
; pattern), while the gfx950 output uses a single v_maximum3_f32.
define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_f32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX90A-NEXT: .LBB123_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB123_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_f32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB123_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, v4
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB123_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a float array: this is the byte offset 40 that appears on the
; flat load and the cmpswap in the expected output.
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
; "=a" places the operand in an AGPR (a0 in the expected output).
%data = call float asm "; def $0", "=a"()
%result = atomicrmw fmaximum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
; The result is consumed from an AGPR as well.
call void asm "; use $0", "a"(float %result)
ret void
}
; f32 fmaximum where the input and the result both pass through inline asm
; using the "^VA" constraint. Both targets are expected to pick VGPRs, though
; not the same ones: def v2 / use v4 on gfx90a and def v3 / use v2 on gfx950.
; As in the AGPR variant, the gfx90a loop body is v_max_f32 plus an
; ordered-compare select against 0x7fc00000 and the gfx950 loop body is a
; single v_maximum3_f32.
define void @flat_atomic_fmaximum_f32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_f32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB124_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2
; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB124_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_f32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB124_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_maximum3_f32 v4, v5, v3, v3
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB124_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a float array: this is the byte offset 40 that appears on the
; flat load and the cmpswap in the expected output.
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
; The operand comes from inline asm so the register choice is driven by the
; constraint string and not by an incoming argument.
%data = call float asm "; def $0", "=^VA"()
%result = atomicrmw fmaximum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
; Consuming the result keeps the returning form of the atomic alive.
call void asm "; use $0", "^VA"(float %result)
ret void
}
; f32 fminimum where the input and the result both use the "a" (AGPR)
; constraint. The two targets compute the new value differently inside the
; cmpswap loop: the gfx90a output uses v_min_f32 plus a v_cmp_o_f32 /
; v_cndmask_b32 select against the constant 0x7fc00000 (the f32 quiet-NaN bit
; pattern), while the gfx950 output uses a single v_minimum3_f32.
define void @flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_f32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX90A-NEXT: .LBB125_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB125_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_f32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB125_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_minimum3_f32 v2, v3, v4, v4
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB125_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a float array: this is the byte offset 40 that appears on the
; flat load and the cmpswap in the expected output.
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
; "=a" places the operand in an AGPR (a0 in the expected output).
%data = call float asm "; def $0", "=a"()
%result = atomicrmw fminimum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
; The result is consumed from an AGPR as well.
call void asm "; use $0", "a"(float %result)
ret void
}
; f32 fminimum where the input and the result both pass through inline asm
; using the "^VA" constraint. Both targets are expected to pick VGPRs, though
; not the same ones: def v2 / use v4 on gfx90a and def v3 / use v2 on gfx950.
; As in the AGPR variant, the gfx90a loop body is v_min_f32 plus an
; ordered-compare select against 0x7fc00000 and the gfx950 loop body is a
; single v_minimum3_f32.
define void @flat_atomic_fminimum_f32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_f32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB126_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2
; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB126_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_f32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB126_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_minimum3_f32 v4, v5, v3, v3
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB126_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a float array: this is the byte offset 40 that appears on the
; flat load and the cmpswap in the expected output.
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
; The operand comes from inline asm so the register choice is driven by the
; constraint string and not by an incoming argument.
%data = call float asm "; def $0", "=^VA"()
%result = atomicrmw fminimum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
; Consuming the result keeps the returning form of the atomic alive.
call void asm "; use $0", "^VA"(float %result)
ret void
}
;---------------------------------------------------------------------
; other atomics f64, with aa+av cases
;---------------------------------------------------------------------
; f64 fadd where the input and the result both use the "a" (AGPR) constraint,
; so the 64-bit value lives in a[0:1]. The expected output is not a single
; flat atomic: the high dword of the address is compared against
; src_shared_base and src_private_base and control is split three ways:
;   atomicrmw.global  -> global_atomic_add_f64
;   atomicrmw.private -> load, v_add_f64, store (buffer ops on gfx90a,
;                        scratch ops on gfx950)
;   atomicrmw.shared  -> ds_add_rtn_f64
; Each path copies the returned value into a0/a1 with v_accvgpr_write_b32.
define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_f64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB127_6
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB127_3
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB127_3: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB127_5
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB127_5: ; %Flow1
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB127_6: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB127_8
; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: .LBB127_8: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_f64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB127_6
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v1
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX950-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX950-NEXT: s_cbranch_execz .LBB127_3
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB127_3: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3]
; GFX950-NEXT: s_cbranch_execz .LBB127_5
; GFX950-NEXT: ; %bb.4: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB127_5: ; %Flow1
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB127_6: ; %Flow2
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB127_8
; GFX950-NEXT: ; %bb.7: ; %atomicrmw.shared
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: .LBB127_8: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a double array: this is the 0x50 (80 byte) constant that is
; added to the pointer at the top of the expected output.
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
; "=a" places the 64-bit operand in an AGPR pair (a[0:1] in the expected output).
%data = call double asm "; def $0", "=a"()
; Unlike the f32 min/max tests, only !amdgpu.no.fine.grained.memory is attached here.
%result = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; The result is consumed from an AGPR pair as well.
call void asm "; use $0", "a"(double %result)
ret void
}
; f64 fadd where the input and the result both pass through inline asm using
; the "^VA" constraint. The expected output for both targets picks VGPR pairs
; (def v[4:5] / use v[0:1]) with no accvgpr copies. As in the AGPR variant,
; the high dword of the address is compared against src_shared_base and
; src_private_base and control is split three ways:
;   atomicrmw.global  -> global_atomic_add_f64
;   atomicrmw.private -> load, v_add_f64, store (buffer ops on gfx90a,
;                        scratch ops on gfx950)
;   atomicrmw.shared  -> ds_add_rtn_f64
define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_f64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB128_6
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v3
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB128_3
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], v[4:5], off glc
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB128_3: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB128_5
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB128_5: ; %Flow1
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB128_6: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB128_8
; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc
; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: .LBB128_8: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_f64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB128_6
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v3
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX950-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX950-NEXT: s_cbranch_execz .LBB128_3
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[2:3], v[4:5], off sc0
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB128_3: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3]
; GFX950-NEXT: s_cbranch_execz .LBB128_5
; GFX950-NEXT: ; %bb.4: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5]
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB128_5: ; %Flow1
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB128_6: ; %Flow2
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB128_8
; GFX950-NEXT: ; %bb.7: ; %atomicrmw.shared
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc
; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5]
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: .LBB128_8: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a double array: this is the 0x50 (80 byte) constant that is
; added to the pointer at the top of the expected output.
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
; The operand comes from inline asm so the register choice is driven by the
; constraint string and not by an incoming argument.
%data = call double asm "; def $0", "=^VA"()
; Unlike the f32 min/max tests, only !amdgpu.no.fine.grained.memory is attached here.
%result = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Consuming the result keeps the returning form of the atomic alive.
call void asm "; use $0", "^VA"(double %result)
ret void
}
define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_f64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB129_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB129_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[6:7]
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB129_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB129_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB129_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB129_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_f64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB129_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB129_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[6:7]
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB129_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB129_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB129_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB129_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
%result = atomicrmw fsub ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(double %result)
ret void
}
; Input and result use the "^VA" constraint; the checked output assigns both
; to VGPR pairs.
; Workgroup-scope seq_cst atomicrmw fsub on the double at element 10 (byte
; offset 0x50) of %ptr, tagged !amdgpu.no.fine.grained.memory. The checks show
; a flat_atomic_cmpswap_x2 retry loop on the %atomicrmw.global path and a plain
; load / v_add_f64 (negated operand) / store sequence on the
; %atomicrmw.private path (buffer_* on the gfx90a run, scratch_* on gfx950).
define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_f64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB130_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB130_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5]
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB130_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB130_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB130_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB130_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_f64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB130_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB130_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX950-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB130_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB130_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB130_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3]
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
; GFX950-NEXT: .LBB130_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=^VA"()
%result = atomicrmw fsub ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(double %result)
ret void
}
; Input and result use AGPR ("a" constraint).
; Workgroup-scope seq_cst atomicrmw fmax on the double at element 10 (byte
; offset 0x50) of %ptr, tagged !amdgpu.no.fine.grained.memory. The checks show
; the a[0:1] operand copied into VGPRs with v_accvgpr_read_b32, a single
; flat_atomic_max_f64 on the %atomicrmw.global path, a load / v_max_f64 / store
; sequence on the %atomicrmw.private path, and the returned value copied back
; into a[0:1] with v_accvgpr_write_b32 on both paths.
define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_f64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB131_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB131_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB131_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB131_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_f64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB131_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB131_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB131_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB131_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
%result = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(double %result)
ret void
}
; Input and result use the "^VA" constraint; the checked output assigns both
; to VGPR pairs, so no v_accvgpr copies appear.
; Workgroup-scope seq_cst atomicrmw fmax on the double at element 10 (byte
; offset 0x50) of %ptr, tagged !amdgpu.no.fine.grained.memory. The checks show
; a single flat_atomic_max_f64 on the %atomicrmw.global path and a load /
; v_max_f64 / store sequence on the %atomicrmw.private path.
define void @flat_atomic_fmax_f64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_f64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB132_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3], v[4:5] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB132_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB132_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB132_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_f64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB132_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB132_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB132_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX950-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB132_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=^VA"()
%result = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(double %result)
ret void
}
; Input and result use AGPR ("a" constraint).
; Workgroup-scope seq_cst atomicrmw fmin on the double at element 10 (byte
; offset 0x50) of %ptr, tagged !amdgpu.no.fine.grained.memory. The checks show
; the a[0:1] operand copied into VGPRs with v_accvgpr_read_b32, a single
; flat_atomic_min_f64 on the %atomicrmw.global path, and on the
; %atomicrmw.private path a load, a self v_max_f64 of each operand, v_min_f64,
; and a store. The returned value is copied back into a[0:1] with
; v_accvgpr_write_b32 on both paths.
define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_f64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB133_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB133_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB133_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB133_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_f64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB133_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB133_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB133_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB133_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
%result = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(double %result)
ret void
}
; Input and result use the "^VA" constraint; the checked output assigns both
; to VGPR pairs, so no v_accvgpr copies appear.
; Workgroup-scope seq_cst atomicrmw fmin on the double at element 10 (byte
; offset 0x50) of %ptr, tagged !amdgpu.no.fine.grained.memory. The checks show
; a single flat_atomic_min_f64 on the %atomicrmw.global path and, on the
; %atomicrmw.private path, a load, a self v_max_f64 of each operand, v_min_f64,
; and a store.
define void @flat_atomic_fmin_f64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_f64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB134_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3], v[4:5] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB134_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB134_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB134_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_f64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB134_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB134_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB134_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX950-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB134_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=^VA"()
%result = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(double %result)
ret void
}
; Input and result use AGPR ("a" constraint).
; Workgroup-scope seq_cst atomicrmw fmaximum on the double at element 10 (byte
; offset 0x50) of %ptr, tagged !amdgpu.no.fine.grained.memory. The checks show
; no single atomic instruction for this operation: the %atomicrmw.global path
; is a flat_atomic_cmpswap_x2 retry loop and the %atomicrmw.private path is a
; plain load / compute / store. On both paths the new value is v_max_f64 of
; the loaded value and the operand, replaced via v_cndmask by the 64-bit
; pattern 0x7ff80000:00000000 when v_cmp_u_f64 reports an unordered compare.
; The loaded (old) value is copied into a[0:1] with v_accvgpr_write_b32.
define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_f64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB135_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000
; GFX90A-NEXT: .LBB135_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB135_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB135_4: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB135_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB135_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_f64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB135_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
; GFX950-NEXT: .LBB135_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB135_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB135_4: ; %Flow2
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB135_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB135_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
%result = atomicrmw fmaximum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(double %result)
ret void
}
define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_f64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB136_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX90A-NEXT: .LBB136_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB136_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB136_4: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB136_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc
; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB136_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_f64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB136_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX950-NEXT: .LBB136_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3]
; GFX950-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB136_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB136_4: ; %Flow2
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB136_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
; GFX950-NEXT: .LBB136_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=^VA"()
%result = atomicrmw fmaximum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(double %result)
ret void
}
; fminimum on f64, workgroup scope, seq_cst, at element 10 of %ptr.
; Input and result use AGPR (the "a" inline-asm constraint).
; On both targets the expected code compares the high half of the address with
; src_private_base and then takes either the %atomicrmw.global path (a
; flat_atomic_cmpswap_x2 retry loop) or the %atomicrmw.private path (load,
; v_min_f64, store). On each path the old value is copied into a[0:1] for the
; asm use, and an unordered compare selects 0x7ff80000 / 0 as the new value.
define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_f64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB137_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000
; GFX90A-NEXT: .LBB137_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB137_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB137_4: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB137_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB137_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_f64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB137_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
; GFX950-NEXT: .LBB137_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB137_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB137_4: ; %Flow2
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB137_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB137_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a [512 x double] array: byte offset 80 (the 0x50 seen above).
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
; Inline-asm def so the atomic's data operand originates in an AGPR pair.
%data = call double asm "; def $0", "=a"()
%result = atomicrmw fminimum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Inline-asm use that requires the returned old value in an AGPR pair.
call void asm "; use $0", "a"(double %result)
ret void
}
; fminimum on f64, workgroup scope, seq_cst, at element 10 of %ptr.
; Input and result use the "^VA" inline-asm constraint; in the expected output
; below both end up in VGPRs (def in v[0:1], use of v[2:3]) and no
; v_accvgpr_read/write copies appear.
; As in the AGPR variant, the expected code takes either the %atomicrmw.global
; path (flat_atomic_cmpswap_x2 retry loop) or the %atomicrmw.private path
; (load, v_min_f64, store).
define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_f64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB138_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX90A-NEXT: .LBB138_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB138_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB138_4: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB138_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc
; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB138_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_f64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB138_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX950-NEXT: .LBB138_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3]
; GFX950-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB138_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB138_4: ; %Flow2
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB138_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
; GFX950-NEXT: .LBB138_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a [512 x double] array: byte offset 80 (the 0x50 seen above).
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
; Inline-asm def of the data operand under the "^VA" constraint.
%data = call double asm "; def $0", "=^VA"()
%result = atomicrmw fminimum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Inline-asm use of the returned old value under the "^VA" constraint.
call void asm "; use $0", "^VA"(double %result)
ret void
}
;---------------------------------------------------------------------
; other atomics v2f16, with aa+av cases
;---------------------------------------------------------------------
; fadd on <2 x half>, workgroup scope, seq_cst, at element 10 of %ptr.
; Input and result use AGPR (the "a" inline-asm constraint).
; Expected output differs per target: the gfx90a checks show a
; v_pk_add_f16 + flat_atomic_cmpswap retry loop, while the gfx950 checks show
; a single returning flat_atomic_pk_add_f16. In both, the data is copied out of
; a0 before the atomic and the old value is copied back into a0 afterwards.
define void @flat_atomic_fadd_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_v2f16_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB139_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB139_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_v2f16_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a [512 x <2 x half>] array: byte offset 40 (offset:40 above).
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
; Inline-asm def so the atomic's data operand originates in an AGPR.
%data = call <2 x half> asm "; def $0", "=a"()
%result = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Inline-asm use that requires the returned old value in an AGPR.
call void asm "; use $0", "a"(<2 x half> %result)
ret void
}
; fadd on <2 x half>, workgroup scope, seq_cst, at element 10 of %ptr.
; Input and result use the "^VA" inline-asm constraint; the expected output
; below keeps both in VGPRs with no v_accvgpr_read/write copies.
; The gfx90a checks show a v_pk_add_f16 + flat_atomic_cmpswap retry loop; the
; gfx950 checks show a single returning flat_atomic_pk_add_f16.
define void @flat_atomic_fadd_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_v2f16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB140_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v3
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB140_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_v2f16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a [512 x <2 x half>] array: byte offset 40 (offset:40 above).
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
; Inline-asm def of the data operand under the "^VA" constraint.
%data = call <2 x half> asm "; def $0", "=^VA"()
%result = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Inline-asm use of the returned old value under the "^VA" constraint.
call void asm "; use $0", "^VA"(<2 x half> %result)
ret void
}
; fsub on <2 x half>, workgroup scope, seq_cst, at element 10 of %ptr.
; Input and result use AGPR (the "a" inline-asm constraint).
; On both targets the expected output is a flat_atomic_cmpswap retry loop whose
; new value is computed by v_pk_add_f16 with the second operand negated
; (neg_lo/neg_hi). The old value is written to a0 inside the loop.
define void @flat_atomic_fsub_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_v2f16_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB141_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1]
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB141_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_v2f16_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB141_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1]
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB141_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a [512 x <2 x half>] array: byte offset 40 (offset:40 above).
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
; Inline-asm def so the atomic's data operand originates in an AGPR.
%data = call <2 x half> asm "; def $0", "=a"()
%result = atomicrmw fsub ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Inline-asm use that requires the returned old value in an AGPR.
call void asm "; use $0", "a"(<2 x half> %result)
ret void
}
; fsub on <2 x half>, workgroup scope, seq_cst, at element 10 of %ptr.
; Input and result use the "^VA" inline-asm constraint; the expected output
; below keeps both in VGPRs (def v3, use v2) with no v_accvgpr_read/write
; copies.
; On both targets the expected output is a flat_atomic_cmpswap retry loop whose
; new value is computed by v_pk_add_f16 with the second operand negated.
define void @flat_atomic_fsub_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_v2f16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB142_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1]
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB142_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_v2f16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB142_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1]
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB142_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a [512 x <2 x half>] array: byte offset 40 (offset:40 above).
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
; Inline-asm def of the data operand under the "^VA" constraint.
%data = call <2 x half> asm "; def $0", "=^VA"()
%result = atomicrmw fsub ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Inline-asm use of the returned old value under the "^VA" constraint.
call void asm "; use $0", "^VA"(<2 x half> %result)
ret void
}
; fmax on <2 x half>, workgroup scope, seq_cst, at element 10 of %ptr.
; Input and result use AGPR (the "a" inline-asm constraint).
; On both targets the expected output is a flat_atomic_cmpswap retry loop.
; Each operand is first passed through v_pk_max_f16 x, x (presumably a
; canonicalization step; NOTE(review): confirm) before the real v_pk_max_f16.
; The data operand's self-max is done once ahead of the loop.
define void @flat_atomic_fmax_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_v2f16_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB143_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB143_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_v2f16_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_pk_max_f16 v4, v2, v2
; GFX950-NEXT: .LBB143_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_max_f16 v2, v3, v3
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_pk_max_f16 v2, v2, v4
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB143_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a [512 x <2 x half>] array: byte offset 40 (offset:40 above).
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
; Inline-asm def so the atomic's data operand originates in an AGPR.
%data = call <2 x half> asm "; def $0", "=a"()
%result = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Inline-asm use that requires the returned old value in an AGPR.
call void asm "; use $0", "a"(<2 x half> %result)
ret void
}
; fmax on <2 x half>, workgroup scope, seq_cst, at element 10 of %ptr.
; Input and result use the "^VA" inline-asm constraint; the expected output
; below keeps both in VGPRs (def v3, use v2) with no v_accvgpr_read/write
; copies.
; On both targets the expected output is a flat_atomic_cmpswap retry loop.
; Each operand is first passed through v_pk_max_f16 x, x (presumably a
; canonicalization step; NOTE(review): confirm) before the real v_pk_max_f16.
define void @flat_atomic_fmax_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_v2f16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v3, v3, v3
; GFX90A-NEXT: .LBB144_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5
; GFX90A-NEXT: v_pk_max_f16 v4, v2, v3
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB144_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_v2f16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_pk_max_f16 v3, v3, v3
; GFX950-NEXT: .LBB144_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_pk_max_f16 v2, v5, v5
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_pk_max_f16 v4, v2, v3
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB144_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a [512 x <2 x half>] array: byte offset 40 (offset:40 above).
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
; Inline-asm def of the data operand under the "^VA" constraint.
%data = call <2 x half> asm "; def $0", "=^VA"()
%result = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Inline-asm use of the returned old value under the "^VA" constraint.
call void asm "; use $0", "^VA"(<2 x half> %result)
ret void
}
; fmin on <2 x half>, workgroup scope, seq_cst, at element 10 of %ptr.
; Input and result use AGPR (the "a" inline-asm constraint).
; On both targets the expected output is a flat_atomic_cmpswap retry loop.
; Each operand is first passed through v_pk_max_f16 x, x (presumably a
; canonicalization step; NOTE(review): confirm) and the new value is then
; computed by v_pk_min_f16. The old value is written to a0 inside the loop.
define void @flat_atomic_fmin_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_v2f16_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB145_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB145_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_v2f16_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_pk_max_f16 v4, v2, v2
; GFX950-NEXT: .LBB145_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_max_f16 v2, v3, v3
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_pk_min_f16 v2, v2, v4
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB145_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a [512 x <2 x half>] array: byte offset 40 (offset:40 above).
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
; Inline-asm def so the atomic's data operand originates in an AGPR.
%data = call <2 x half> asm "; def $0", "=a"()
%result = atomicrmw fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Inline-asm use that requires the returned old value in an AGPR.
call void asm "; use $0", "a"(<2 x half> %result)
ret void
}
; fmin on <2 x half>, workgroup scope, seq_cst, at element 10 of %ptr.
; Input and result use the "^VA" inline-asm constraint; the expected output
; below keeps both in VGPRs (def v3, use v2) with no v_accvgpr_read/write
; copies.
; On both targets the expected output is a flat_atomic_cmpswap retry loop.
; Each operand is first passed through v_pk_max_f16 x, x (presumably a
; canonicalization step; NOTE(review): confirm) and the new value is then
; computed by v_pk_min_f16.
define void @flat_atomic_fmin_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_v2f16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v3, v3, v3
; GFX90A-NEXT: .LBB146_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5
; GFX90A-NEXT: v_pk_min_f16 v4, v2, v3
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB146_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_v2f16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_pk_max_f16 v3, v3, v3
; GFX950-NEXT: .LBB146_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_pk_max_f16 v2, v5, v5
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_pk_min_f16 v4, v2, v3
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB146_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a [512 x <2 x half>] array: byte offset 40 (offset:40 above).
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
; Inline-asm def of the data operand under the "^VA" constraint.
%data = call <2 x half> asm "; def $0", "=^VA"()
%result = atomicrmw fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Inline-asm use of the returned old value under the "^VA" constraint.
call void asm "; use $0", "^VA"(<2 x half> %result)
ret void
}
; fmaximum on <2 x half> at offset:40, workgroup scope, seq_cst, with the "a"
; constraint on both the asm def and the asm use (a0 in the checks). The input
; is copied out of a0 with v_accvgpr_read before the loop and the loop result
; is written back to a0 with v_accvgpr_write.
; Expected as a flat_atomic_cmpswap loop on both run lines. The gfx90a checks
; build the new value from v_pk_max_f16 plus per-half ordered compares that
; select the constant 0x7e00 when a compare fails (NOTE(review): 0x7e00 is
; presumably the f16 quiet-NaN encoding - confirm). The gfx950 checks use a
; single v_pk_maximum3_f16.
define void @flat_atomic_fmaximum_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_v2f16_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
; GFX90A-NEXT: .LBB147_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_max_f16 v2, v3, v4
; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v3, v4 src0_sel:WORD_1 src1_sel:WORD_1
; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v3, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v2, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_perm_b32 v2, v2, v6, s8
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB147_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_v2f16_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB147_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_maximum3_f16 v2, v3, v4, v4
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB147_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
%data = call <2 x half> asm "; def $0", "=a"()
%result = atomicrmw fmaximum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(<2 x half> %result)
ret void
}
; Same fmaximum <2 x half> operation as the _a_a variant, but the asm def and
; use carry the "^VA" constraint. The checked output assigns VGPRs on both
; sides (gfx90a: def v2 / use v4, gfx950: def v3 / use v2) and contains no
; v_accvgpr copies.
; Still a flat_atomic_cmpswap loop on both run lines: v_pk_max_f16 with an
; ordered-compare select against 0x7e00 on gfx90a, v_pk_maximum3_f16 on
; gfx950.
define void @flat_atomic_fmaximum_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_v2f16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB148_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2
; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1
; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB148_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_v2f16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB148_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_pk_maximum3_f16 v4, v5, v3, v3
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB148_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
%data = call <2 x half> asm "; def $0", "=^VA"()
%result = atomicrmw fmaximum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(<2 x half> %result)
ret void
}
; fminimum on <2 x half> at offset:40, workgroup scope, seq_cst, with the "a"
; constraint on both the asm def and the asm use (a0 in the checks). The input
; is read from a0 with v_accvgpr_read before the loop and the loop result is
; written back to a0 with v_accvgpr_write.
; Expected as a flat_atomic_cmpswap loop on both run lines. The gfx90a checks
; combine v_pk_min_f16 with per-half ordered compares that select the constant
; 0x7e00 when a compare fails (NOTE(review): presumably the f16 quiet-NaN
; encoding - confirm). The gfx950 checks use a single v_pk_minimum3_f16.
define void @flat_atomic_fminimum_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_v2f16_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
; GFX90A-NEXT: .LBB149_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_min_f16 v2, v3, v4
; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v3, v4 src0_sel:WORD_1 src1_sel:WORD_1
; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v3, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v2, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_perm_b32 v2, v2, v6, s8
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB149_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_v2f16_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB149_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_minimum3_f16 v2, v3, v4, v4
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB149_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
%data = call <2 x half> asm "; def $0", "=a"()
%result = atomicrmw fminimum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(<2 x half> %result)
ret void
}
; Same fminimum <2 x half> operation as the _a_a variant, but the asm def and
; use carry the "^VA" constraint. The checked output assigns VGPRs on both
; sides (gfx90a: def v2 / use v4, gfx950: def v3 / use v2) and contains no
; v_accvgpr copies.
; Still a flat_atomic_cmpswap loop on both run lines: v_pk_min_f16 with an
; ordered-compare select against 0x7e00 on gfx90a, v_pk_minimum3_f16 on
; gfx950.
define void @flat_atomic_fminimum_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_v2f16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB150_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2
; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1
; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB150_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_v2f16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB150_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_pk_minimum3_f16 v4, v5, v3, v3
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB150_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
%data = call <2 x half> asm "; def $0", "=^VA"()
%result = atomicrmw fminimum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(<2 x half> %result)
ret void
}
;---------------------------------------------------------------------
; other atomics v2bf16, with aa+av cases
;---------------------------------------------------------------------
; fadd on <2 x bfloat> at offset:40, workgroup scope, seq_cst, with the "a"
; constraint on both the asm def and the asm use (a0 in the checks).
; The two run lines diverge here. The gfx90a checks expect a
; flat_atomic_cmpswap loop that widens each half to f32 (shift left by 16 /
; mask with 0xffff0000), adds with v_add_f32, narrows again through the
; bfe/add3/or/cndmask sequence and repacks with v_perm_b32. The gfx950 checks
; expect a single returning flat_atomic_pk_add_bf16 with v_accvgpr_read /
; v_accvgpr_write around it and no loop.
define void @flat_atomic_fadd_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_v2bf16_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB151_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB151_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_v2bf16_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
%data = call <2 x bfloat> asm "; def $0", "=a"()
%result = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(<2 x bfloat> %result)
ret void
}
; Same fadd <2 x bfloat> operation as the _a_a variant, but the asm def and
; use carry the "^VA" constraint. The checked output assigns VGPRs on both
; sides (gfx90a: def v3 / use v4, gfx950: def v2 / use v0) and contains no
; v_accvgpr copies.
; The gfx90a checks still expect the f32-widening flat_atomic_cmpswap loop.
; The gfx950 checks expect just flat_atomic_pk_add_bf16 feeding the asm use
; directly.
define void @flat_atomic_fadd_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB152_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2
; GFX90A-NEXT: v_add_f32_e32 v6, v6, v3
; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB152_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
%data = call <2 x bfloat> asm "; def $0", "=^VA"()
%result = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(<2 x bfloat> %result)
ret void
}
; fsub on <2 x bfloat> at offset:40, workgroup scope, seq_cst, with the "a"
; constraint on both the asm def and the asm use (a0 in the checks).
; Unlike the fadd case, both run lines expect a flat_atomic_cmpswap loop. Each
; half is widened to f32 (shift left by 16 / mask with 0xffff0000) and
; subtracted with v_sub_f32. To narrow back, the gfx90a checks use the
; bfe/add3/or/cndmask sequence plus v_perm_b32, while the gfx950 checks use a
; single v_cvt_pk_bf16_f32.
define void @flat_atomic_fsub_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_v2bf16_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB153_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB153_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_v2bf16_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .LBB153_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX950-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX950-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB153_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
%data = call <2 x bfloat> asm "; def $0", "=a"()
%result = atomicrmw fsub ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(<2 x bfloat> %result)
ret void
}
; Same fsub <2 x bfloat> operation as the _a_a variant, but the asm def and
; use carry the "^VA" constraint. The checked output assigns VGPRs on both
; sides (def v3 / use v4 on both run lines) and contains no v_accvgpr copies.
; Both run lines expect a flat_atomic_cmpswap loop around v_sub_f32 on the
; widened halves. The gfx950 checks narrow back with v_cvt_pk_bf16_f32, the
; gfx90a checks with the bfe/add3/or/cndmask sequence plus v_perm_b32.
define void @flat_atomic_fsub_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB154_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v3
; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB154_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX950-NEXT: .LBB154_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v4
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX950-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX950-NEXT: v_sub_f32_e32 v6, v6, v3
; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB154_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
%data = call <2 x bfloat> asm "; def $0", "=^VA"()
%result = atomicrmw fsub ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(<2 x bfloat> %result)
ret void
}
; fmax on <2 x bfloat> at offset:40, workgroup scope, seq_cst, with the "a"
; constraint on both the asm def and the asm use (a0 in the checks).
; Both run lines expect a flat_atomic_cmpswap loop. Each half is widened to
; f32 (shift left by 16 / mask with 0xffff0000) and combined with v_max_f32.
; To narrow back, the gfx90a checks use the bfe/add3/or/cndmask sequence plus
; v_perm_b32, while the gfx950 checks use a single v_cvt_pk_bf16_f32.
define void @flat_atomic_fmax_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_v2bf16_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB155_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB155_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_v2bf16_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .LBB155_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX950-NEXT: v_max_f32_e32 v2, v2, v4
; GFX950-NEXT: v_max_f32_e32 v6, v6, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB155_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
%data = call <2 x bfloat> asm "; def $0", "=a"()
%result = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(<2 x bfloat> %result)
ret void
}
; Same fmax <2 x bfloat> operation as the _a_a variant, but the asm def and
; use carry the "^VA" constraint. The checked output assigns VGPRs on both
; sides (def v3 / use v4 on both run lines) and contains no v_accvgpr copies.
; Both run lines expect a flat_atomic_cmpswap loop around v_max_f32 on the
; widened halves. The gfx950 checks narrow back with v_cvt_pk_bf16_f32, the
; gfx90a checks with the bfe/add3/or/cndmask sequence plus v_perm_b32.
define void @flat_atomic_fmax_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB156_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2
; GFX90A-NEXT: v_max_f32_e32 v6, v6, v3
; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB156_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX950-NEXT: .LBB156_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v4
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX950-NEXT: v_max_f32_e32 v4, v4, v2
; GFX950-NEXT: v_max_f32_e32 v6, v6, v3
; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB156_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
%data = call <2 x bfloat> asm "; def $0", "=^VA"()
%result = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(<2 x bfloat> %result)
ret void
}
; fmin on <2 x bfloat> through a flat pointer. Input and result use AGPR.
; Both targets are expected to expand the operation into a
; flat_atomic_cmpswap retry loop. gfx90a narrows the two f32 minima back to
; bf16 with an explicit bfe/add3/cndmask/perm sequence, while gfx950 uses a
; single v_cvt_pk_bf16_f32.
define void @flat_atomic_fmin_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_v2bf16_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB157_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB157_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_v2bf16_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .LBB157_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX950-NEXT: v_min_f32_e32 v2, v2, v4
; GFX950-NEXT: v_min_f32_e32 v6, v6, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB157_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an array of 4-byte elements, i.e. the offset:40 seen in the
; expected memory instructions.
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
; The "=a" constraint produces the data operand in an AGPR ("def a0" above).
%data = call <2 x bfloat> asm "; def $0", "=a"()
%result = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; The "a" constraint consumes the returned value from an AGPR ("use a0" above).
call void asm "; use $0", "a"(<2 x bfloat> %result)
ret void
}
; fmin on <2 x bfloat> through a flat pointer. Input and result use the "^VA"
; constraint; the expected output keeps both in VGPRs with no accvgpr copies.
; Both targets are expected to expand the operation into a
; flat_atomic_cmpswap retry loop. gfx90a narrows the two f32 minima back to
; bf16 with an explicit bfe/add3/cndmask/perm sequence, while gfx950 uses a
; single v_cvt_pk_bf16_f32.
define void @flat_atomic_fmin_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB158_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2
; GFX90A-NEXT: v_min_f32_e32 v6, v6, v3
; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB158_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX950-NEXT: .LBB158_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v4
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX950-NEXT: v_min_f32_e32 v4, v4, v2
; GFX950-NEXT: v_min_f32_e32 v6, v6, v3
; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB158_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an array of 4-byte elements, i.e. the offset:40 seen in the
; expected memory instructions.
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
; With "=^VA" the expected output defines the data in a VGPR ("def v3" above).
%data = call <2 x bfloat> asm "; def $0", "=^VA"()
%result = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; With "^VA" the returned value is consumed straight from a VGPR ("use v4" above).
call void asm "; use $0", "^VA"(<2 x bfloat> %result)
ret void
}
; fmaximum on <2 x bfloat> through a flat pointer. Input and result use AGPR.
; Both targets are expected to expand the operation into a
; flat_atomic_cmpswap retry loop. gfx90a builds each lane from v_max_f32 plus
; a v_cmp_o_f32 select of 0x7fc00000 (an f32 quiet-NaN bit pattern) for
; unordered inputs, then narrows to bf16 by hand; gfx950 uses
; v_maximum3_f32 followed by v_cvt_pk_bf16_f32.
define void @flat_atomic_fmaximum_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_v2bf16_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB159_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
; GFX90A-NEXT: v_max_f32_e32 v8, v2, v4
; GFX90A-NEXT: v_max_f32_e32 v9, v7, v6
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6
; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc
; GFX90A-NEXT: v_bfe_u32 v8, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
; GFX90A-NEXT: v_add3_u32 v8, v8, v2, s8
; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
; GFX90A-NEXT: v_perm_b32 v2, v7, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB159_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_v2bf16_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .LBB159_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX950-NEXT: v_maximum3_f32 v2, v2, v4, v4
; GFX950-NEXT: v_maximum3_f32 v6, v6, v5, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB159_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an array of 4-byte elements, i.e. the offset:40 seen in the
; expected memory instructions.
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
; The "=a" constraint produces the data operand in an AGPR ("def a0" above).
%data = call <2 x bfloat> asm "; def $0", "=a"()
%result = atomicrmw fmaximum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; The "a" constraint consumes the returned value from an AGPR ("use a0" above).
call void asm "; use $0", "a"(<2 x bfloat> %result)
ret void
}
; fmaximum on <2 x bfloat> through a flat pointer. Input and result use the
; "^VA" constraint; the expected output keeps both in VGPRs with no accvgpr
; copies.
; Both targets are expected to expand the operation into a
; flat_atomic_cmpswap retry loop. gfx90a builds each lane from v_max_f32 plus
; a v_cmp_o_f32 select of 0x7fc00000 (an f32 quiet-NaN bit pattern) for
; unordered inputs, then narrows to bf16 by hand; gfx950 uses
; v_maximum3_f32 followed by v_cvt_pk_bf16_f32.
define void @flat_atomic_fmaximum_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB160_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
; GFX90A-NEXT: v_max_f32_e32 v8, v5, v2
; GFX90A-NEXT: v_max_f32_e32 v9, v6, v4
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4
; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9
; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB160_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v5
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX950-NEXT: .LBB160_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v4
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX950-NEXT: v_maximum3_f32 v4, v4, v2, v2
; GFX950-NEXT: v_maximum3_f32 v6, v6, v3, v3
; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB160_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an array of 4-byte elements, i.e. the offset:40 seen in the
; expected memory instructions.
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
; With "=^VA" the expected output defines the data in a VGPR (v4 on gfx90a,
; v3 on gfx950).
%data = call <2 x bfloat> asm "; def $0", "=^VA"()
%result = atomicrmw fmaximum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; With "^VA" the returned value is consumed straight from a VGPR.
call void asm "; use $0", "^VA"(<2 x bfloat> %result)
ret void
}
; fminimum on <2 x bfloat> through a flat pointer. Input and result use AGPR.
; Both targets are expected to expand the operation into a
; flat_atomic_cmpswap retry loop. gfx90a builds each lane from v_min_f32 plus
; a v_cmp_o_f32 select of 0x7fc00000 (an f32 quiet-NaN bit pattern) for
; unordered inputs, then narrows to bf16 by hand; gfx950 uses
; v_minimum3_f32 followed by v_cvt_pk_bf16_f32.
define void @flat_atomic_fminimum_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_v2bf16_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB161_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
; GFX90A-NEXT: v_min_f32_e32 v8, v2, v4
; GFX90A-NEXT: v_min_f32_e32 v9, v7, v6
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6
; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc
; GFX90A-NEXT: v_bfe_u32 v8, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
; GFX90A-NEXT: v_add3_u32 v8, v8, v2, s8
; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
; GFX90A-NEXT: v_perm_b32 v2, v7, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB161_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_v2bf16_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .LBB161_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX950-NEXT: v_minimum3_f32 v2, v2, v4, v4
; GFX950-NEXT: v_minimum3_f32 v6, v6, v5, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB161_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an array of 4-byte elements, i.e. the offset:40 seen in the
; expected memory instructions.
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
; The "=a" constraint produces the data operand in an AGPR ("def a0" above).
%data = call <2 x bfloat> asm "; def $0", "=a"()
%result = atomicrmw fminimum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; The "a" constraint consumes the returned value from an AGPR ("use a0" above).
call void asm "; use $0", "a"(<2 x bfloat> %result)
ret void
}
; fminimum on <2 x bfloat> through a flat pointer. Input and result use the
; "^VA" constraint; the expected output keeps both in VGPRs with no accvgpr
; copies.
; Both targets are expected to expand the operation into a
; flat_atomic_cmpswap retry loop. gfx90a builds each lane from v_min_f32 plus
; a v_cmp_o_f32 select of 0x7fc00000 (an f32 quiet-NaN bit pattern) for
; unordered inputs, then narrows to bf16 by hand; gfx950 uses
; v_minimum3_f32 followed by v_cvt_pk_bf16_f32.
define void @flat_atomic_fminimum_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB162_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
; GFX90A-NEXT: v_min_f32_e32 v8, v5, v2
; GFX90A-NEXT: v_min_f32_e32 v9, v6, v4
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4
; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9
; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB162_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v5
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX950-NEXT: .LBB162_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v4
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX950-NEXT: v_minimum3_f32 v4, v4, v2, v2
; GFX950-NEXT: v_minimum3_f32 v6, v6, v3, v3
; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB162_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an array of 4-byte elements, i.e. the offset:40 seen in the
; expected memory instructions.
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
; With "=^VA" the expected output defines the data in a VGPR (v4 on gfx90a,
; v3 on gfx950).
%data = call <2 x bfloat> asm "; def $0", "=^VA"()
%result = atomicrmw fminimum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; With "^VA" the returned value is consumed straight from a VGPR.
call void asm "; use $0", "^VA"(<2 x bfloat> %result)
ret void
}
;---------------------------------------------------------------------
; other atomics i32, with aa+av cases using saddr
;---------------------------------------------------------------------
; xchg on i32 with the pointer passed inreg. Input and result use AGPR.
; The expected output copies the SGPR pointer pair (s16/s17 on gfx90a, s0/s1
; on gfx950) into v[0:1] and issues one returning flat_atomic_swap. gfx950
; also expects an s_nop 0 between the inline asm and the AGPR read
; (presumably a hazard workaround; confirm against the backend).
define void @flat_atomic_xchg_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an i32 array, i.e. the offset:40 on the expected atomic.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
; The "=a" constraint produces the data operand in an AGPR ("def a0" above).
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw xchg ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; The "a" constraint consumes the returned value from an AGPR ("use a0" above).
call void asm "; use $0", "a"(i32 %result)
ret void
}
; xchg on i32 with the pointer passed inreg. Input and result use the "^VA"
; constraint; the expected output keeps both in VGPRs with no accvgpr copies.
; The expected output copies the SGPR pointer pair (s16/s17 on gfx90a, s0/s1
; on gfx950) into v[0:1] and issues one returning flat_atomic_swap.
define void @flat_atomic_xchg_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an i32 array, i.e. the offset:40 on the expected atomic.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
; With "=^VA" the expected output defines the data in a VGPR ("def v2" above).
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw xchg ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; With "^VA" the returned value is consumed straight from a VGPR ("use v0" above).
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; add on i32 with the pointer passed inreg. Input and result use AGPR.
; The expected output copies the SGPR pointer pair (s16/s17 on gfx90a, s0/s1
; on gfx950) into v[0:1] and issues one returning flat_atomic_add. gfx950
; also expects an s_nop 0 between the inline asm and the AGPR read
; (presumably a hazard workaround; confirm against the backend).
define void @flat_atomic_add_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_add_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_add_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an i32 array, i.e. the offset:40 on the expected atomic.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
; The "=a" constraint produces the data operand in an AGPR ("def a0" above).
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw add ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; The "a" constraint consumes the returned value from an AGPR ("use a0" above).
call void asm "; use $0", "a"(i32 %result)
ret void
}
; add on i32 with the pointer passed inreg. Input and result use the "^VA"
; constraint; the expected output keeps both in VGPRs with no accvgpr copies.
; The expected output copies the SGPR pointer pair (s16/s17 on gfx90a, s0/s1
; on gfx950) into v[0:1] and issues one returning flat_atomic_add.
define void @flat_atomic_add_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_add_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_add_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an i32 array, i.e. the offset:40 on the expected atomic.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
; With "=^VA" the expected output defines the data in a VGPR ("def v2" above).
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw add ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; With "^VA" the returned value is consumed straight from a VGPR ("use v0" above).
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; sub on i32 with the pointer passed inreg. Input and result use AGPR.
; The expected output copies the SGPR pointer pair (s16/s17 on gfx90a, s0/s1
; on gfx950) into v[0:1] and issues one returning flat_atomic_sub. gfx950
; also expects an s_nop 0 between the inline asm and the AGPR read
; (presumably a hazard workaround; confirm against the backend).
define void @flat_atomic_sub_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_sub_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_sub_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an i32 array, i.e. the offset:40 on the expected atomic.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
; The "=a" constraint produces the data operand in an AGPR ("def a0" above).
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw sub ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; The "a" constraint consumes the returned value from an AGPR ("use a0" above).
call void asm "; use $0", "a"(i32 %result)
ret void
}
; sub on i32 with the pointer passed inreg. Input and result use the "^VA"
; constraint; the expected output keeps both in VGPRs with no accvgpr copies.
; The expected output copies the SGPR pointer pair (s16/s17 on gfx90a, s0/s1
; on gfx950) into v[0:1] and issues one returning flat_atomic_sub.
define void @flat_atomic_sub_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_sub_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_sub_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an i32 array, i.e. the offset:40 on the expected atomic.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
; With "=^VA" the expected output defines the data in a VGPR ("def v2" above).
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw sub ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; With "^VA" the returned value is consumed straight from a VGPR ("use v0" above).
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; and on i32 with the pointer passed inreg. Input and result use AGPR.
; The expected output copies the SGPR pointer pair (s16/s17 on gfx90a, s0/s1
; on gfx950) into v[0:1] and issues one returning flat_atomic_and. gfx950
; also expects an s_nop 0 between the inline asm and the AGPR read
; (presumably a hazard workaround; confirm against the backend).
define void @flat_atomic_and_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_and_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_and_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an i32 array, i.e. the offset:40 on the expected atomic.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
; The "=a" constraint produces the data operand in an AGPR ("def a0" above).
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw and ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; The "a" constraint consumes the returned value from an AGPR ("use a0" above).
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Workgroup-scope seq_cst 'atomicrmw and' through a flat pointer passed inreg.
; Data and result use the "^VA" inline-asm constraint (presumably the combined
; VGPR/AGPR class, per the _av suffix -- confirm against the backend docs).
; The expected output keeps both values in VGPRs (v2 in, v0 out) with no
; accvgpr copies around flat_atomic_and.
define void @flat_atomic_and_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_and_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_and_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw and ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Workgroup-scope seq_cst 'atomicrmw nand' through a flat pointer passed inreg,
; with data and result pinned to AGPRs ("a" constraint).
; The expected output contains no nand atomic instruction; instead it shows an
; initial flat_load_dword followed by a flat_atomic_cmpswap retry loop
; (%atomicrmw.start). Inside the loop ~(old & data) is formed with
; v_and + v_not on gfx90a and a single v_bitop3_b32 (0x3f) on gfx950, and the
; value returned by the cmpswap is written to a0 on every iteration.
define void @flat_atomic_nand_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_nand_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB171_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_and_b32_e32 v2, v3, v4
; GFX90A-NEXT: v_not_b32_e32 v2, v2
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB171_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_nand_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB171_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_bitop3_b32 v2, v3, v4, v3 bitop3:0x3f
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB171_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw nand ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Workgroup-scope seq_cst 'atomicrmw nand' through a flat pointer passed inreg,
; with data and result using the "^VA" constraint (presumably the combined
; VGPR/AGPR class, per the _av suffix -- confirm against the backend docs).
; The expected output is a flat_load_dword plus a flat_atomic_cmpswap retry
; loop (%atomicrmw.start) computing ~(old & data); every value stays in VGPRs
; and the cmpswap result register is what the trailing inline asm consumes.
define void @flat_atomic_nand_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_nand_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB172_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_and_b32_e32 v3, v5, v2
; GFX90A-NEXT: v_not_b32_e32 v4, v3
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB172_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_nand_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB172_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_bitop3_b32 v4, v5, v3, v5 bitop3:0x3f
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB172_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw nand ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Workgroup-scope seq_cst 'atomicrmw or' through a flat pointer passed inreg
; (copied from SGPRs into v[0:1] in the expected output).
; Data and result are pinned to AGPRs ("a" constraint): the checks expect
; a0 to be read into v2 before flat_atomic_or and the returned v0 to be
; written back to a0. The accessed address is element 10, i.e. offset:40.
define void @flat_atomic_or_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_or_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_or_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw or ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Workgroup-scope seq_cst 'atomicrmw or' through a flat pointer passed inreg.
; Data and result use the "^VA" constraint (presumably the combined VGPR/AGPR
; class, per the _av suffix -- confirm against the backend docs); the expected
; output feeds flat_atomic_or directly from v2 and consumes v0, with no
; accvgpr copies.
define void @flat_atomic_or_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_or_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_or_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw or ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Workgroup-scope seq_cst 'atomicrmw xor' through a flat pointer passed inreg
; (copied from SGPRs into v[0:1] in the expected output).
; Data and result are pinned to AGPRs ("a" constraint), so flat_atomic_xor is
; expected to be bracketed by an accvgpr read of a0 and an accvgpr write back
; to a0. The accessed address is element 10, i.e. offset:40.
define void @flat_atomic_xor_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw xor ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Workgroup-scope seq_cst 'atomicrmw xor' through a flat pointer passed inreg.
; Data and result use the "^VA" constraint (presumably the combined VGPR/AGPR
; class, per the _av suffix -- confirm against the backend docs); the expected
; output uses VGPRs only (v2 in, v0 out) around flat_atomic_xor.
define void @flat_atomic_xor_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw xor ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Workgroup-scope seq_cst signed 'atomicrmw max' through a flat pointer passed
; inreg; the expected instruction is flat_atomic_smax.
; Data and result are pinned to AGPRs ("a" constraint), so the checks expect
; an accvgpr read of a0 before the atomic and an accvgpr write to a0 after it.
; The accessed address is element 10, i.e. offset:40.
define void @flat_atomic_max_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_max_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_max_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw max ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Workgroup-scope seq_cst signed 'atomicrmw max' through a flat pointer passed
; inreg; the expected instruction is flat_atomic_smax.
; Data and result use the "^VA" constraint (presumably the combined VGPR/AGPR
; class, per the _av suffix -- confirm against the backend docs); the expected
; output uses VGPRs only (v2 in, v0 out).
define void @flat_atomic_max_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_max_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_max_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw max ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Workgroup-scope seq_cst signed 'atomicrmw min' through a flat pointer passed
; inreg; the expected instruction is flat_atomic_smin.
; Data and result are pinned to AGPRs ("a" constraint), so the checks expect
; an accvgpr read of a0 before the atomic and an accvgpr write to a0 after it.
; The accessed address is element 10, i.e. offset:40.
define void @flat_atomic_min_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_min_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_min_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw min ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Workgroup-scope seq_cst signed 'atomicrmw min' through a flat pointer passed
; inreg; the expected instruction is flat_atomic_smin.
; Data and result use the "^VA" constraint (presumably the combined VGPR/AGPR
; class, per the _av suffix -- confirm against the backend docs); the expected
; output uses VGPRs only (v2 in, v0 out).
define void @flat_atomic_min_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_min_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_min_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw min ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Workgroup-scope seq_cst unsigned 'atomicrmw umax' through a flat pointer
; passed inreg; the expected instruction is flat_atomic_umax.
; Data and result are pinned to AGPRs ("a" constraint), so the checks expect
; an accvgpr read of a0 before the atomic and an accvgpr write to a0 after it.
; The accessed address is element 10, i.e. offset:40.
define void @flat_atomic_umax_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umax_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umax_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw umax ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Workgroup-scope seq_cst unsigned 'atomicrmw umax' through a flat pointer
; passed inreg; the expected instruction is flat_atomic_umax.
; Data and result use the "^VA" constraint (presumably the combined VGPR/AGPR
; class, per the _av suffix -- confirm against the backend docs); the expected
; output uses VGPRs only (v2 in, v0 out).
define void @flat_atomic_umax_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umax_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umax_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw umax ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Workgroup-scope seq_cst unsigned 'atomicrmw umin' through a flat pointer
; passed inreg; the expected instruction is flat_atomic_umin.
; Data and result are pinned to AGPRs ("a" constraint), so the checks expect
; an accvgpr read of a0 before the atomic and an accvgpr write to a0 after it.
; The accessed address is element 10, i.e. offset:40.
define void @flat_atomic_umin_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umin_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umin_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw umin ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Workgroup-scope seq_cst unsigned 'atomicrmw umin' through a flat pointer
; passed inreg; the expected instruction is flat_atomic_umin.
; Data and result use the "^VA" constraint (presumably the combined VGPR/AGPR
; class, per the _av suffix -- confirm against the backend docs); the expected
; output uses VGPRs only (v2 in, v0 out).
define void @flat_atomic_umin_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umin_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umin_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw umin ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Workgroup-scope seq_cst 'atomicrmw uinc_wrap' through a flat pointer passed
; inreg; the expected instruction is flat_atomic_inc.
; Data and result are pinned to AGPRs ("a" constraint), so the checks expect
; an accvgpr read of a0 before the atomic and an accvgpr write to a0 after it.
; The accessed address is element 10, i.e. offset:40.
define void @flat_atomic_uinc_wrap_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_uinc_wrap_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_uinc_wrap_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw uinc_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Workgroup-scope seq_cst 'atomicrmw uinc_wrap' through a flat pointer passed
; inreg; the expected instruction is flat_atomic_inc.
; Data and result use the "^VA" constraint (presumably the combined VGPR/AGPR
; class, per the _av suffix -- confirm against the backend docs); the expected
; output uses VGPRs only (v2 in, v0 out).
define void @flat_atomic_uinc_wrap_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_uinc_wrap_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_uinc_wrap_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw uinc_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Workgroup-scope seq_cst 'atomicrmw udec_wrap' through a flat pointer passed
; inreg; the expected instruction is flat_atomic_dec.
; Data and result are pinned to AGPRs ("a" constraint), so the checks expect
; an accvgpr read of a0 before the atomic and an accvgpr write to a0 after it.
; The accessed address is element 10, i.e. offset:40.
define void @flat_atomic_udec_wrap_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_udec_wrap_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_udec_wrap_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw udec_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; Workgroup-scope seq_cst 'atomicrmw udec_wrap' through a flat pointer passed
; inreg; the expected instruction is flat_atomic_dec.
; Data and result use the "^VA" constraint (presumably the combined VGPR/AGPR
; class, per the _av suffix -- confirm against the backend docs); the expected
; output uses VGPRs only (v2 in, v0 out).
define void @flat_atomic_udec_wrap_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_udec_wrap_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_udec_wrap_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw udec_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; Workgroup-scope seq_cst 'atomicrmw usub_cond' through a flat pointer passed
; inreg, with data and result pinned to AGPRs ("a" constraint).
; The expected output contains no dedicated atomic for this operation; it
; shows an initial flat_load_dword followed by a flat_atomic_cmpswap retry
; loop (%atomicrmw.start). Each iteration computes old - data with v_sub,
; keeps it only when old >= data (unsigned v_cmp_ge + v_cndmask, otherwise the
; old value is stored back), and writes the cmpswap result to a0.
define void @flat_atomic_usub_cond_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_cond_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB189_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_sub_u32_e32 v2, v3, v4
; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB189_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_usub_cond_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB189_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_sub_u32_e32 v2, v3, v4
; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB189_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw usub_cond ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; atomicrmw usub_cond on the i32 at element 10 (byte offset 40) of an inreg
; flat pointer, workgroup scope, seq_cst. The operand is produced and the
; result consumed by inline asm using the "^VA" constraint.
; Expected on both gfx90a and gfx950: a flat_load_dword followed by a
; flat_atomic_cmpswap retry loop in which sub + compare + cndmask compute the
; new value. The asm operand and the result stay in VGPRs (v2 / v3); no
; v_accvgpr copies are expected.
define void @flat_atomic_usub_cond_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_cond_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB190_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_sub_u32_e32 v3, v5, v2
; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2
; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB190_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_usub_cond_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB190_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v3
; GFX950-NEXT: v_sub_u32_e32 v3, v5, v2
; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
; GFX950-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB190_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw usub_cond ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
; atomicrmw usub_sat on the i32 at element 10 (byte offset 40) of an inreg
; flat pointer, workgroup scope, seq_cst. Both the operand and the result go
; through inline asm with the "a" (AGPR) constraint.
; Expected on both gfx90a and gfx950: the operand is copied a0 -> v4 once
; before the loop, the loop computes the new value with a clamped v_sub_u32
; and retries via flat_atomic_cmpswap, and the returned value is copied back
; into a0 with v_accvgpr_write_b32 inside the loop.
define void @flat_atomic_usub_sat_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_sat_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB191_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_sub_u32_e64 v2, v3, v4 clamp
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB191_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_usub_sat_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB191_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_sub_u32_e64 v2, v3, v4 clamp
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB191_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=a"()
%result = atomicrmw usub_sat ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i32 %result)
ret void
}
; atomicrmw usub_sat on the i32 at element 10 (byte offset 40) of an inreg
; flat pointer, workgroup scope, seq_cst. The operand is produced and the
; result consumed by inline asm using the "^VA" constraint.
; Expected on both gfx90a and gfx950: a clamped v_sub_u32 feeding a
; flat_atomic_cmpswap retry loop, with the asm operand (v3) and the result
; (v2) kept in VGPRs and no v_accvgpr copies.
define void @flat_atomic_usub_sat_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_sat_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB192_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_sub_u32_e64 v4, v5, v3 clamp
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB192_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_usub_sat_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB192_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_sub_u32_e64 v4, v5, v3 clamp
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB192_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
%result = atomicrmw usub_sat ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i32 %result)
ret void
}
;---------------------------------------------------------------------
; other atomics i64, with aa+av cases using saddr
;---------------------------------------------------------------------
; atomicrmw xchg on the i64 at element 10 (byte offset 80 = 0x50) of an inreg
; flat pointer, workgroup scope, seq_cst. Both the operand and the result go
; through inline asm with the "a" (AGPR) constraint.
; Expected shape on both targets: the high half of the address is compared
; with src_private_base and the code branches between
;   - %atomicrmw.global: flat_atomic_swap_x2 on VGPR copies of the operand,
;     with the result copied back into AGPRs, and
;   - %atomicrmw.private: a plain load of the old value followed by a store
;     of the operand (buffer_* dword pairs on gfx90a, scratch_*_dwordx2 on
;     gfx950), with the load writing AGPRs directly.
; On gfx90a the result reuses a[0:1]; on gfx950 the result lands in a[2:3]
; and the private path stores the operand straight from a[0:1].
define void @flat_atomic_xchg_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB193_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB193_3
; GFX90A-NEXT: s_branch .LBB193_4
; GFX90A-NEXT: .LBB193_2:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB193_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NEXT: buffer_load_dword a0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword a1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB193_4: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB193_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a3, v1
; GFX950-NEXT: v_accvgpr_write_b32 a2, v0
; GFX950-NEXT: s_cbranch_execz .LBB193_3
; GFX950-NEXT: s_branch .LBB193_4
; GFX950-NEXT: .LBB193_2:
; GFX950-NEXT: ; implicit-def: $agpr2_agpr3
; GFX950-NEXT: .LBB193_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 a[2:3], off, s0
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 off, a[0:1], s0
; GFX950-NEXT: .LBB193_4: ; %atomicrmw.end
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw xchg ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; atomicrmw xchg on the i64 at element 10 (byte offset 80 = 0x50) of an inreg
; flat pointer, workgroup scope, seq_cst. The operand is produced and the
; result consumed by inline asm using the "^VA" constraint.
; Expected shape on both targets: a src_private_base comparison selects
; between %atomicrmw.global (flat_atomic_swap_x2) and %atomicrmw.private
; (load old value, store operand). The operand stays in v[0:1] and the result
; in v[2:3]; no v_accvgpr copies are expected.
define void @flat_atomic_xchg_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB194_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_execz .LBB194_3
; GFX90A-NEXT: s_branch .LBB194_4
; GFX90A-NEXT: .LBB194_2:
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB194_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB194_4: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB194_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_cbranch_execz .LBB194_3
; GFX950-NEXT: s_branch .LBB194_4
; GFX950-NEXT: .LBB194_2:
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB194_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB194_4: ; %atomicrmw.end
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw xchg ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; atomicrmw add on the i64 at element 10 (byte offset 80 = 0x50) of an inreg
; flat pointer, workgroup scope, seq_cst. Both the operand and the result go
; through inline asm with the "a" (AGPR) constraint.
; Expected shape on both targets: the operand is copied a[0:1] -> v[0:1],
; then a src_private_base comparison selects between
;   - %atomicrmw.global: flat_atomic_add_x2, result copied back to a[0:1];
;   - %atomicrmw.private: load old value, add, store the sum, with the old
;     value copied into a[0:1].
; The private-path add is a v_add_co_u32 / v_addc_co_u32 pair on gfx90a and a
; single v_lshl_add_u64 on gfx950.
define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_add_i64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB195_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB195_3
; GFX90A-NEXT: s_branch .LBB195_4
; GFX90A-NEXT: .LBB195_2:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB195_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB195_4: ; %atomicrmw.end
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_add_i64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB195_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB195_3
; GFX950-NEXT: s_branch .LBB195_4
; GFX950-NEXT: .LBB195_2:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB195_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB195_4: ; %atomicrmw.end
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw add ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; atomicrmw add on the i64 at element 10 (byte offset 80 = 0x50) of an inreg
; flat pointer, workgroup scope, seq_cst. The operand is produced and the
; result consumed by inline asm using the "^VA" constraint.
; Expected shape on both targets: a src_private_base comparison selects
; between %atomicrmw.global (flat_atomic_add_x2) and %atomicrmw.private
; (load, add, store). The operand stays in v[0:1] and the result in v[2:3];
; no v_accvgpr copies are expected.
define void @flat_atomic_add_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_add_i64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB196_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_execz .LBB196_3
; GFX90A-NEXT: s_branch .LBB196_4
; GFX90A-NEXT: .LBB196_2:
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB196_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB196_4: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_add_i64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB196_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_cbranch_execz .LBB196_3
; GFX950-NEXT: s_branch .LBB196_4
; GFX950-NEXT: .LBB196_2:
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB196_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB196_4: ; %atomicrmw.end
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw add ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; atomicrmw sub on the i64 at element 10 (byte offset 80 = 0x50) of an inreg
; flat pointer, workgroup scope, seq_cst. Both the operand and the result go
; through inline asm with the "a" (AGPR) constraint.
; Expected shape on both targets: the operand is copied a[0:1] -> v[0:1],
; then a src_private_base comparison selects between
;   - %atomicrmw.global: flat_atomic_sub_x2, result copied back to a[0:1];
;   - %atomicrmw.private: load old value, v_sub_co_u32 / v_subb_co_u32
;     (old - operand), store the difference, with the old value copied into
;     a[0:1].
define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_sub_i64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB197_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB197_3
; GFX90A-NEXT: s_branch .LBB197_4
; GFX90A-NEXT: .LBB197_2:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB197_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB197_4: ; %atomicrmw.end
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_sub_i64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB197_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB197_3
; GFX950-NEXT: s_branch .LBB197_4
; GFX950-NEXT: .LBB197_2:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB197_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB197_4: ; %atomicrmw.end
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw sub ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; atomicrmw sub on the i64 at element 10 (byte offset 80 = 0x50) of an inreg
; flat pointer, workgroup scope, seq_cst. The operand is produced and the
; result consumed by inline asm using the "^VA" constraint.
; Expected shape on both targets: a src_private_base comparison selects
; between %atomicrmw.global (flat_atomic_sub_x2) and %atomicrmw.private
; (load, v_sub_co_u32 / v_subb_co_u32, store). Everything stays in VGPRs with
; no v_accvgpr copies. Note that the register assignment differs per target:
; gfx90a uses v[0:1] for the operand and v[2:3] for the result, while gfx950
; uses v[2:3] for the operand and v[0:1] for the result.
define void @flat_atomic_sub_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_sub_i64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB198_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_execz .LBB198_3
; GFX90A-NEXT: s_branch .LBB198_4
; GFX90A-NEXT: .LBB198_2:
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB198_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB198_4: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_sub_i64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB198_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_cbranch_execz .LBB198_3
; GFX950-NEXT: s_branch .LBB198_4
; GFX950-NEXT: .LBB198_2:
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB198_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB198_4: ; %atomicrmw.end
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw sub ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; atomicrmw and on the i64 at element 10 (byte offset 80 = 0x50) of an inreg
; flat pointer, workgroup scope, seq_cst. Both the operand and the result go
; through inline asm with the "a" (AGPR) constraint.
; Expected shape on both targets: the operand is copied a[0:1] -> v[0:1],
; then a src_private_base comparison selects between
;   - %atomicrmw.global: flat_atomic_and_x2, result copied back to a[0:1];
;   - %atomicrmw.private: load old value, two v_and_b32 (one per 32-bit
;     half), store the result, with the old value copied into a[0:1].
define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_and_i64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB199_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB199_3
; GFX90A-NEXT: s_branch .LBB199_4
; GFX90A-NEXT: .LBB199_2:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB199_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_and_b32_e32 v0, v4, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
; GFX90A-NEXT: v_and_b32_e32 v1, v3, v1
; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB199_4: ; %atomicrmw.end
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_and_i64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB199_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB199_3
; GFX950-NEXT: s_branch .LBB199_4
; GFX950-NEXT: .LBB199_2:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB199_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_and_b32_e32 v1, v3, v1
; GFX950-NEXT: v_and_b32_e32 v0, v2, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB199_4: ; %atomicrmw.end
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw and ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; i64 atomicrmw and (result used) through an inreg flat pointer. The data
; operand and the result are both constrained with "^VA"; the expected output
; for both targets places them in VGPRs.
; The expected output compares the high half of the address against
; src_private_base and then either issues flat_atomic_and_x2
; (%atomicrmw.global) or performs a scratch load / v_and / store
; (%atomicrmw.private).
define void @flat_atomic_and_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_and_i64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB200_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_execz .LBB200_3
; GFX90A-NEXT: s_branch .LBB200_4
; GFX90A-NEXT: .LBB200_2:
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB200_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_and_b32_e32 v1, v3, v1
; GFX90A-NEXT: v_and_b32_e32 v0, v2, v0
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB200_4: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_and_i64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB200_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_cbranch_execz .LBB200_3
; GFX950-NEXT: s_branch .LBB200_4
; GFX950-NEXT: .LBB200_2:
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB200_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v3, v1, v3
; GFX950-NEXT: v_and_b32_e32 v2, v0, v2
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB200_4: ; %atomicrmw.end
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Index 10 of an i64 array is byte offset 80, i.e. the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw and ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; i64 atomicrmw nand (result used) through an inreg flat pointer. Input and
; result use AGPR ("a" constraint).
; The expected output has no native nand instruction: the %atomicrmw.global
; path is a flat_atomic_cmpswap_x2 retry loop (%atomicrmw.start) and the
; %atomicrmw.private path is a scratch load / v_and / v_not / store.
; The AGPR input is copied to VGPRs with v_accvgpr_read_b32 before use, and
; the result is copied back with v_accvgpr_write_b32.
define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_nand_i64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_cbranch_vccz .LBB201_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB201_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_and_b32_e32 v0, v3, v5
; GFX90A-NEXT: v_and_b32_e32 v8, v2, v4
; GFX90A-NEXT: v_not_b32_e32 v1, v0
; GFX90A-NEXT: v_not_b32_e32 v0, v8
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB201_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB201_6
; GFX90A-NEXT: .LBB201_4:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_cbranch_execz .LBB201_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v0, s4
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_and_b32_e32 v3, v1, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_and_b32_e32 v4, v2, v4
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_not_b32_e32 v2, v3
; GFX90A-NEXT: v_not_b32_e32 v3, v4
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB201_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_nand_i64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: s_cbranch_vccz .LBB201_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB201_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v0, v3, v5
; GFX950-NEXT: v_and_b32_e32 v8, v2, v4
; GFX950-NEXT: v_not_b32_e32 v1, v0
; GFX950-NEXT: v_not_b32_e32 v0, v8
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB201_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB201_6
; GFX950-NEXT: .LBB201_4:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_cbranch_execz .LBB201_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, v1, v5
; GFX950-NEXT: v_and_b32_e32 v4, v0, v4
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_not_b32_e32 v3, v2
; GFX950-NEXT: v_not_b32_e32 v2, v4
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB201_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Index 10 of an i64 array is byte offset 80, i.e. the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw nand ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; i64 atomicrmw nand (result used) through an inreg flat pointer. The data
; operand and the result are both constrained with "^VA"; the expected output
; for both targets places them in VGPRs, so no v_accvgpr copies appear.
; The expected output has no native nand instruction: the %atomicrmw.global
; path is a flat_atomic_cmpswap_x2 retry loop (%atomicrmw.start) and the
; %atomicrmw.private path is a scratch load / v_and / v_not / store.
define void @flat_atomic_nand_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_nand_i64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB202_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB202_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: v_and_b32_e32 v2, v9, v1
; GFX90A-NEXT: v_and_b32_e32 v3, v8, v0
; GFX90A-NEXT: v_not_b32_e32 v7, v2
; GFX90A-NEXT: v_not_b32_e32 v6, v3
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB202_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB202_6
; GFX90A-NEXT: .LBB202_4:
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_cbranch_execz .LBB202_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_and_b32_e32 v1, v3, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_and_b32_e32 v0, v2, v0
; GFX90A-NEXT: v_not_b32_e32 v0, v0
; GFX90A-NEXT: v_not_b32_e32 v1, v1
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB202_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_nand_i64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB202_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB202_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3]
; GFX950-NEXT: v_and_b32_e32 v2, v9, v1
; GFX950-NEXT: v_and_b32_e32 v3, v8, v0
; GFX950-NEXT: v_not_b32_e32 v7, v2
; GFX950-NEXT: v_not_b32_e32 v6, v3
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB202_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB202_6
; GFX950-NEXT: .LBB202_4:
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_cbranch_execz .LBB202_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v1, v3, v1
; GFX950-NEXT: v_and_b32_e32 v0, v2, v0
; GFX950-NEXT: v_not_b32_e32 v1, v1
; GFX950-NEXT: v_not_b32_e32 v0, v0
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB202_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Index 10 of an i64 array is byte offset 80, i.e. the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw nand ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; i64 atomicrmw or (result used) through an inreg flat pointer. Input and
; result use AGPR ("a" constraint).
; The expected output compares the high half of the address against
; src_private_base and then either issues flat_atomic_or_x2
; (%atomicrmw.global) or performs a scratch load / v_or / store
; (%atomicrmw.private).
; The AGPR input is copied to VGPRs with v_accvgpr_read_b32 before use, and
; the result is copied back with v_accvgpr_write_b32.
define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_or_i64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB203_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB203_3
; GFX90A-NEXT: s_branch .LBB203_4
; GFX90A-NEXT: .LBB203_2:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB203_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_or_b32_e32 v0, v4, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
; GFX90A-NEXT: v_or_b32_e32 v1, v3, v1
; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB203_4: ; %atomicrmw.end
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_or_i64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB203_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB203_3
; GFX950-NEXT: s_branch .LBB203_4
; GFX950-NEXT: .LBB203_2:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB203_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_or_b32_e32 v1, v3, v1
; GFX950-NEXT: v_or_b32_e32 v0, v2, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB203_4: ; %atomicrmw.end
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Index 10 of an i64 array is byte offset 80, i.e. the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw or ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; i64 atomicrmw or (result used) through an inreg flat pointer. The data
; operand and the result are both constrained with "^VA"; the expected output
; for both targets places them in VGPRs.
; The expected output compares the high half of the address against
; src_private_base and then either issues flat_atomic_or_x2
; (%atomicrmw.global) or performs a scratch load / v_or / store
; (%atomicrmw.private).
define void @flat_atomic_or_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_or_i64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB204_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_execz .LBB204_3
; GFX90A-NEXT: s_branch .LBB204_4
; GFX90A-NEXT: .LBB204_2:
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB204_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_or_b32_e32 v1, v3, v1
; GFX90A-NEXT: v_or_b32_e32 v0, v2, v0
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB204_4: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_or_i64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB204_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_cbranch_execz .LBB204_3
; GFX950-NEXT: s_branch .LBB204_4
; GFX950-NEXT: .LBB204_2:
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB204_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_or_b32_e32 v3, v1, v3
; GFX950-NEXT: v_or_b32_e32 v2, v0, v2
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB204_4: ; %atomicrmw.end
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Index 10 of an i64 array is byte offset 80, i.e. the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw or ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; i64 atomicrmw xor (result used) through an inreg flat pointer. Input and
; result use AGPR ("a" constraint).
; The expected output compares the high half of the address against
; src_private_base and then either issues flat_atomic_xor_x2
; (%atomicrmw.global) or performs a scratch load / v_xor / store
; (%atomicrmw.private).
; The AGPR input is copied to VGPRs with v_accvgpr_read_b32 before use, and
; the result is copied back with v_accvgpr_write_b32.
define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB205_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB205_3
; GFX90A-NEXT: s_branch .LBB205_4
; GFX90A-NEXT: .LBB205_2:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB205_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v0, v4, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v1
; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB205_4: ; %atomicrmw.end
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB205_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB205_3
; GFX950-NEXT: s_branch .LBB205_4
; GFX950-NEXT: .LBB205_2:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB205_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_xor_b32_e32 v1, v3, v1
; GFX950-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB205_4: ; %atomicrmw.end
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Index 10 of an i64 array is byte offset 80, i.e. the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw xor ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; i64 atomicrmw xor (result used) through an inreg flat pointer. The data
; operand and the result are both constrained with "^VA"; the expected output
; for both targets places them in VGPRs.
; The expected output compares the high half of the address against
; src_private_base and then either issues flat_atomic_xor_x2
; (%atomicrmw.global) or performs a scratch load / v_xor / store
; (%atomicrmw.private).
define void @flat_atomic_xor_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB206_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_execz .LBB206_3
; GFX90A-NEXT: s_branch .LBB206_4
; GFX90A-NEXT: .LBB206_2:
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB206_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v1
; GFX90A-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB206_4: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB206_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_cbranch_execz .LBB206_3
; GFX950-NEXT: s_branch .LBB206_4
; GFX950-NEXT: .LBB206_2:
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB206_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3
; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB206_4: ; %atomicrmw.end
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Index 10 of an i64 array is byte offset 80, i.e. the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw xor ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; i64 atomicrmw max (signed, result used) through an inreg flat pointer.
; Input and result use AGPR ("a" constraint).
; The expected output compares the high half of the address against
; src_private_base and then either issues flat_atomic_smax_x2
; (%atomicrmw.global) or performs a scratch load, a v_cmp_gt_i64 plus
; v_cndmask select, and a store (%atomicrmw.private).
; The AGPR input is copied to VGPRs with v_accvgpr_read_b32 before use, and
; the result is copied back with v_accvgpr_write_b32.
define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_max_i64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB207_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB207_3
; GFX90A-NEXT: s_branch .LBB207_4
; GFX90A-NEXT: .LBB207_2:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB207_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB207_4: ; %atomicrmw.end
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_max_i64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB207_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB207_3
; GFX950-NEXT: s_branch .LBB207_4
; GFX950-NEXT: .LBB207_2:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB207_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB207_4: ; %atomicrmw.end
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Index 10 of an i64 array is byte offset 80, i.e. the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw max ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; i64 signed-max atomicrmw (workgroup scope, seq_cst) through a flat pointer
; passed inreg; the expected output shows the pointer arriving in SGPRs.
; The data operand and the returned value both go through inline asm with the
; "^VA" constraint (the "av" variant, per the function name); the expected
; output places both in VGPRs on gfx90a and gfx950.
; Expected shape on both targets: the high half of the address is compared
; against src_private_base. The %atomicrmw.global path issues
; flat_atomic_smax_x2; the %atomicrmw.private path is a load / v_cmp_gt_i64 /
; select / store sequence (buffer_* on gfx90a, scratch_* on gfx950).
define void @flat_atomic_max_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_max_i64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB208_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_execz .LBB208_3
; GFX90A-NEXT: s_branch .LBB208_4
; GFX90A-NEXT: .LBB208_2:
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB208_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB208_4: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_max_i64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB208_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_cbranch_execz .LBB208_3
; GFX950-NEXT: s_branch .LBB208_4
; GFX950-NEXT: .LBB208_2:
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB208_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB208_4: ; %atomicrmw.end
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an i64 array, i.e. byte offset 80 (the 0x50 added above).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; The inline asm def/use pins the register class of the operand and result.
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw max ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; i64 signed-min atomicrmw (workgroup scope, seq_cst) through a flat pointer
; passed inreg; the expected output shows the pointer arriving in SGPRs.
; The data operand and the returned value both use the "a" (AGPR) inline asm
; constraint. The expected output copies a[0:1] into VGPRs with
; v_accvgpr_read_b32 before the atomic and copies the result back with
; v_accvgpr_write_b32.
; Expected shape on both targets: the high half of the address is compared
; against src_private_base. The %atomicrmw.global path issues
; flat_atomic_smin_x2; the %atomicrmw.private path is a load / v_cmp_le_i64 /
; select / store sequence (buffer_* on gfx90a, scratch_* on gfx950).
define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_min_i64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB209_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB209_3
; GFX90A-NEXT: s_branch .LBB209_4
; GFX90A-NEXT: .LBB209_2:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB209_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB209_4: ; %atomicrmw.end
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_min_i64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB209_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB209_3
; GFX950-NEXT: s_branch .LBB209_4
; GFX950-NEXT: .LBB209_2:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB209_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB209_4: ; %atomicrmw.end
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an i64 array, i.e. byte offset 80 (the 0x50 added above).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; The inline asm def/use pins the register class of the operand and result.
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw min ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; i64 signed-min atomicrmw (workgroup scope, seq_cst) through a flat pointer
; passed inreg; the expected output shows the pointer arriving in SGPRs.
; The data operand and the returned value both go through inline asm with the
; "^VA" constraint (the "av" variant, per the function name); the expected
; output places both in VGPRs on gfx90a and gfx950.
; Expected shape on both targets: the high half of the address is compared
; against src_private_base. The %atomicrmw.global path issues
; flat_atomic_smin_x2; the %atomicrmw.private path is a load / v_cmp_le_i64 /
; select / store sequence (buffer_* on gfx90a, scratch_* on gfx950).
define void @flat_atomic_min_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_min_i64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB210_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_execz .LBB210_3
; GFX90A-NEXT: s_branch .LBB210_4
; GFX90A-NEXT: .LBB210_2:
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB210_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB210_4: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_min_i64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB210_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_cbranch_execz .LBB210_3
; GFX950-NEXT: s_branch .LBB210_4
; GFX950-NEXT: .LBB210_2:
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB210_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB210_4: ; %atomicrmw.end
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an i64 array, i.e. byte offset 80 (the 0x50 added above).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; The inline asm def/use pins the register class of the operand and result.
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw min ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; i64 unsigned-max atomicrmw (workgroup scope, seq_cst) through a flat pointer
; passed inreg; the expected output shows the pointer arriving in SGPRs.
; The data operand and the returned value both use the "a" (AGPR) inline asm
; constraint. The expected output copies a[0:1] into VGPRs with
; v_accvgpr_read_b32 before the atomic and copies the result back with
; v_accvgpr_write_b32.
; Expected shape on both targets: the high half of the address is compared
; against src_private_base. The %atomicrmw.global path issues
; flat_atomic_umax_x2; the %atomicrmw.private path is a load / v_cmp_gt_u64 /
; select / store sequence (buffer_* on gfx90a, scratch_* on gfx950).
define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umax_i64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB211_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB211_3
; GFX90A-NEXT: s_branch .LBB211_4
; GFX90A-NEXT: .LBB211_2:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB211_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB211_4: ; %atomicrmw.end
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umax_i64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB211_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB211_3
; GFX950-NEXT: s_branch .LBB211_4
; GFX950-NEXT: .LBB211_2:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB211_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB211_4: ; %atomicrmw.end
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an i64 array, i.e. byte offset 80 (the 0x50 added above).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; The inline asm def/use pins the register class of the operand and result.
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw umax ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; i64 unsigned-max atomicrmw (workgroup scope, seq_cst) through a flat pointer
; passed inreg; the expected output shows the pointer arriving in SGPRs.
; The data operand and the returned value both go through inline asm with the
; "^VA" constraint (the "av" variant, per the function name); the expected
; output places both in VGPRs on gfx90a and gfx950.
; Expected shape on both targets: the high half of the address is compared
; against src_private_base. The %atomicrmw.global path issues
; flat_atomic_umax_x2; the %atomicrmw.private path is a load / v_cmp_gt_u64 /
; select / store sequence (buffer_* on gfx90a, scratch_* on gfx950).
define void @flat_atomic_umax_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umax_i64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB212_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_execz .LBB212_3
; GFX90A-NEXT: s_branch .LBB212_4
; GFX90A-NEXT: .LBB212_2:
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB212_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB212_4: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umax_i64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB212_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_cbranch_execz .LBB212_3
; GFX950-NEXT: s_branch .LBB212_4
; GFX950-NEXT: .LBB212_2:
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB212_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB212_4: ; %atomicrmw.end
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an i64 array, i.e. byte offset 80 (the 0x50 added above).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; The inline asm def/use pins the register class of the operand and result.
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw umax ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; i64 unsigned-min atomicrmw (workgroup scope, seq_cst) through a flat pointer
; passed inreg; the expected output shows the pointer arriving in SGPRs.
; The data operand and the returned value both use the "a" (AGPR) inline asm
; constraint. The expected output copies a[0:1] into VGPRs with
; v_accvgpr_read_b32 before the atomic and copies the result back with
; v_accvgpr_write_b32.
; Expected shape on both targets: the high half of the address is compared
; against src_private_base. The %atomicrmw.global path issues
; flat_atomic_umin_x2; the %atomicrmw.private path is a load / v_cmp_le_u64 /
; select / store sequence (buffer_* on gfx90a, scratch_* on gfx950).
define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umin_i64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB213_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB213_3
; GFX90A-NEXT: s_branch .LBB213_4
; GFX90A-NEXT: .LBB213_2:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB213_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB213_4: ; %atomicrmw.end
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umin_i64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB213_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB213_3
; GFX950-NEXT: s_branch .LBB213_4
; GFX950-NEXT: .LBB213_2:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB213_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB213_4: ; %atomicrmw.end
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an i64 array, i.e. byte offset 80 (the 0x50 added above).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; The inline asm def/use pins the register class of the operand and result.
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw umin ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; i64 unsigned-min atomicrmw (workgroup scope, seq_cst) through a flat pointer
; passed inreg; the expected output shows the pointer arriving in SGPRs.
; The data operand and the returned value both go through inline asm with the
; "^VA" constraint (the "av" variant, per the function name); the expected
; output places both in VGPRs on gfx90a and gfx950.
; Expected shape on both targets: the high half of the address is compared
; against src_private_base. The %atomicrmw.global path issues
; flat_atomic_umin_x2; the %atomicrmw.private path is a load / v_cmp_le_u64 /
; select / store sequence (buffer_* on gfx90a, scratch_* on gfx950).
define void @flat_atomic_umin_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umin_i64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB214_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_execz .LBB214_3
; GFX90A-NEXT: s_branch .LBB214_4
; GFX90A-NEXT: .LBB214_2:
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB214_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB214_4: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umin_i64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB214_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_cbranch_execz .LBB214_3
; GFX950-NEXT: s_branch .LBB214_4
; GFX950-NEXT: .LBB214_2:
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB214_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB214_4: ; %atomicrmw.end
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an i64 array, i.e. byte offset 80 (the 0x50 added above).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; The inline asm def/use pins the register class of the operand and result.
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw umin ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; i64 uinc_wrap atomicrmw (workgroup scope, seq_cst) through a flat pointer
; passed inreg; the expected output shows the pointer arriving in SGPRs.
; The data operand and the returned value both use the "a" (AGPR) inline asm
; constraint. The expected output copies a[0:1] into VGPRs with
; v_accvgpr_read_b32 before the atomic and copies the result back with
; v_accvgpr_write_b32.
; Expected shape on both targets: the high half of the address is compared
; against src_private_base. The %atomicrmw.global path issues
; flat_atomic_inc_x2. The %atomicrmw.private path loads the old value, adds 1
; (v_add_co/v_addc pair on gfx90a, v_lshl_add_u64 on gfx950), and uses
; v_cmp_lt_u64 old < data to select between the incremented value and 0 before
; storing it back.
define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_uinc_wrap_i64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB215_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB215_3
; GFX90A-NEXT: s_branch .LBB215_4
; GFX90A-NEXT: .LBB215_2:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB215_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc
; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB215_4: ; %atomicrmw.end
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_uinc_wrap_i64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB215_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB215_3
; GFX950-NEXT: s_branch .LBB215_4
; GFX950-NEXT: .LBB215_2:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB215_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, 1
; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB215_4: ; %atomicrmw.end
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an i64 array, i.e. byte offset 80 (the 0x50 added above).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
; The inline asm def/use pins the register class of the operand and result.
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw uinc_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; i64 uinc_wrap through an "inreg" (SGPR) flat pointer, element 10 of an i64
; array (the 0x50 byte offset in the expected output). The data operand and
; the result both use the "^VA" inline-asm constraint; the expected output
; keeps both in VGPRs, with no v_accvgpr copies.
; Expected shape: the high half of the address is compared against
; src_private_base; %atomicrmw.global issues flat_atomic_inc_x2, while
; %atomicrmw.private is a load / compare-select / store of the private
; location (buffer_* on gfx90a, scratch_* on gfx950).
define void @flat_atomic_uinc_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_uinc_wrap_i64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB216_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_execz .LBB216_3
; GFX90A-NEXT: s_branch .LBB216_4
; GFX90A-NEXT: .LBB216_2:
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB216_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB216_4: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_uinc_wrap_i64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB216_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_cbranch_execz .LBB216_3
; GFX950-NEXT: s_branch .LBB216_4
; GFX950-NEXT: .LBB216_2:
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB216_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB216_4: ; %atomicrmw.end
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw uinc_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; i64 udec_wrap through an "inreg" (SGPR) flat pointer, element 10 of an i64
; array (the 0x50 byte offset in the expected output). The data operand and
; the result both use the "a" inline-asm constraint, so the value is defined
; in and consumed from a[0:1].
; Expected shape: the operand is copied out of a[0:1] with v_accvgpr_read
; before the atomic, and the returned value is copied back with
; v_accvgpr_write on both paths. %atomicrmw.global issues flat_atomic_dec_x2;
; %atomicrmw.private is a load / compare-select / store of the private
; location (buffer_* on gfx90a, scratch_* on gfx950).
define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_udec_wrap_i64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB217_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB217_3
; GFX90A-NEXT: s_branch .LBB217_4
; GFX90A-NEXT: .LBB217_2:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB217_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1]
; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB217_4: ; %atomicrmw.end
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_udec_wrap_i64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB217_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB217_3
; GFX950-NEXT: s_branch .LBB217_4
; GFX950-NEXT: .LBB217_2:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB217_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s2, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1]
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1
; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s2
; GFX950-NEXT: .LBB217_4: ; %atomicrmw.end
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw udec_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; i64 udec_wrap through an "inreg" (SGPR) flat pointer, element 10 of an i64
; array (the 0x50 byte offset in the expected output). The data operand and
; the result both use the "^VA" inline-asm constraint; the expected output
; keeps both in VGPRs, with no v_accvgpr copies.
; Expected shape: the high half of the address is compared against
; src_private_base; %atomicrmw.global issues flat_atomic_dec_x2, while
; %atomicrmw.private is a load / compare-select / store of the private
; location (buffer_* on gfx90a, scratch_* on gfx950).
define void @flat_atomic_udec_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_udec_wrap_i64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB218_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_execz .LBB218_3
; GFX90A-NEXT: s_branch .LBB218_4
; GFX90A-NEXT: .LBB218_2:
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB218_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2
; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1]
; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB218_4: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_udec_wrap_i64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB218_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_cbranch_execz .LBB218_3
; GFX950-NEXT: s_branch .LBB218_4
; GFX950-NEXT: .LBB218_2:
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB218_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s2, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s2
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3]
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1
; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s2
; GFX950-NEXT: .LBB218_4: ; %atomicrmw.end
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw udec_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; i64 usub_cond through an "inreg" (SGPR) flat pointer, element 10 of an i64
; array (the 0x50 byte offset in the expected output). The data operand and
; the result both use the "a" inline-asm constraint, so the value is defined
; in and consumed from a[0:1].
; Expected shape: no single flat instruction is used for this operation.
; %atomicrmw.global loads the old value and loops on flat_atomic_cmpswap_x2
; (%atomicrmw.start) until the compare succeeds, copying each returned value
; into a[0:1] with v_accvgpr_write. %atomicrmw.private is a load /
; subtract / compare-select / store of the private location (buffer_* on
; gfx90a, scratch_* on gfx950).
define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_cond_i64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_cbranch_vccz .LBB219_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB219_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB219_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB219_6
; GFX90A-NEXT: .LBB219_4:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_cbranch_execz .LBB219_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc
; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB219_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_usub_cond_i64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: s_cbranch_vccz .LBB219_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB219_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB219_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB219_6
; GFX950-NEXT: .LBB219_4:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_cbranch_execz .LBB219_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB219_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw usub_cond ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; i64 usub_cond through an "inreg" (SGPR) flat pointer, element 10 of an i64
; array (the 0x50 byte offset in the expected output). The data operand and
; the result both use the "^VA" inline-asm constraint; the expected output
; keeps both in VGPRs, with no v_accvgpr copies.
; Expected shape: no single flat instruction is used for this operation.
; %atomicrmw.global loads the old value and loops on flat_atomic_cmpswap_x2
; (%atomicrmw.start) until the compare succeeds. %atomicrmw.private is a
; load / subtract / compare-select / store of the private location
; (buffer_* on gfx90a, scratch_* on gfx950).
define void @flat_atomic_usub_cond_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_cond_i64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB220_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB220_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB220_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB220_6
; GFX90A-NEXT: .LBB220_4:
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_cbranch_execz .LBB220_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, v2, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v1, vcc
; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB220_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_usub_cond_i64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB220_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB220_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3]
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB220_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB220_6
; GFX950-NEXT: .LBB220_4:
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_cbranch_execz .LBB220_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_sub_co_u32_e32 v4, vcc, v2, v0
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v1, vcc
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB220_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw usub_cond ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
; i64 usub_sat through an "inreg" (SGPR) flat pointer, element 10 of an i64
; array (the 0x50 byte offset in the expected output). The data operand and
; the result both use the "a" inline-asm constraint, so the value is defined
; in and consumed from a[0:1].
; Expected shape: no single flat instruction is used for this operation.
; %atomicrmw.global loads the old value and loops on flat_atomic_cmpswap_x2
; (%atomicrmw.start) until the compare succeeds, copying each returned value
; into a[0:1] with v_accvgpr_write. The new value is the 64-bit difference,
; replaced by 0 when the subtract sets the borrow in vcc. %atomicrmw.private
; does the same computation as a load / subtract / select / store of the
; private location (buffer_* on gfx90a, scratch_* on gfx950).
define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_sat_i64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_cbranch_vccz .LBB221_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB221_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB221_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB221_6
; GFX90A-NEXT: .LBB221_4:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_cbranch_execz .LBB221_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v0, s4
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v1, v4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v5, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB221_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_usub_sat_i64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: s_cbranch_vccz .LBB221_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB221_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB221_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB221_6
; GFX950-NEXT: .LBB221_4:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_cbranch_execz .LBB221_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB221_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
%result = atomicrmw usub_sat ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(i64 %result)
ret void
}
; i64 usub_sat through an "inreg" (SGPR) flat pointer, element 10 of an i64
; array (the 0x50 byte offset in the expected output). The data operand and
; the result both use the "^VA" inline-asm constraint; the expected output
; keeps both in VGPRs, with no v_accvgpr copies.
; Expected shape: no single flat instruction is used for this operation.
; %atomicrmw.global loads the old value and loops on flat_atomic_cmpswap_x2
; (%atomicrmw.start) until the compare succeeds. The new value is the 64-bit
; difference, replaced by 0 when the subtract sets the borrow in vcc.
; %atomicrmw.private does the same computation as a load / subtract /
; select / store of the private location (buffer_* on gfx90a, scratch_* on
; gfx950).
define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_sat_i64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB222_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB222_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB222_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB222_6
; GFX90A-NEXT: .LBB222_4:
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_cbranch_execz .LBB222_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB222_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_usub_sat_i64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB222_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB222_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3]
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB222_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB222_6
; GFX950-NEXT: .LBB222_4:
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_cbranch_execz .LBB222_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB222_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=^VA"()
%result = atomicrmw usub_sat ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(i64 %result)
ret void
}
;---------------------------------------------------------------------
; other atomics f32, with aa+av cases using saddr
;---------------------------------------------------------------------
; fadd f32 through an inreg (scalar) flat pointer; input and result use AGPR.
; The expected gfx90a output splits the operation into separate
; shared/private/global paths, while the expected gfx950 output is a single
; flat_atomic_add_f32.
define void @flat_atomic_fadd_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_f32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 40
; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_cbranch_vccz .LBB223_3
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_vccz .LBB223_4
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: global_atomic_add_f32 v1, v1, v0, s[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: s_cbranch_execz .LBB223_5
; GFX90A-NEXT: s_branch .LBB223_6
; GFX90A-NEXT: .LBB223_3:
; GFX90A-NEXT: ; implicit-def: $agpr0
; GFX90A-NEXT: s_branch .LBB223_7
; GFX90A-NEXT: .LBB223_4:
; GFX90A-NEXT: ; implicit-def: $agpr0
; GFX90A-NEXT: .LBB223_5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s6, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f32_e32 v3, v2, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
; GFX90A-NEXT: .LBB223_6: ; %Flow1
; GFX90A-NEXT: s_cbranch_execnz .LBB223_8
; GFX90A-NEXT: .LBB223_7: ; %atomicrmw.shared
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v1, s4
; GFX90A-NEXT: ds_add_rtn_f32 v0, v1, v0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: .LBB223_8: ; %atomicrmw.end
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_f32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Float element 10 is the byte offset 40 that appears in the expected output.
; The "=a"/"a" constraints pin the asm def and use to an AGPR (a0 above).
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
%data = call float asm "; def $0", "=a"()
%result = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
call void asm "; use $0", "a"(float %result)
ret void
}
; fadd f32 through an inreg (scalar) flat pointer; input and result use the
; "^VA" (AV) constraint, which the expected output satisfies with plain VGPRs.
; As in the a_a variant, the expected gfx90a output splits the operation into
; shared/private/global paths and the expected gfx950 output is a single
; flat_atomic_add_f32.
define void @flat_atomic_fadd_f32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_f32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 40
; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB224_3
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_vccz .LBB224_4
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: global_atomic_add_f32 v1, v1, v0, s[4:5] glc
; GFX90A-NEXT: s_cbranch_execz .LBB224_5
; GFX90A-NEXT: s_branch .LBB224_6
; GFX90A-NEXT: .LBB224_3:
; GFX90A-NEXT: ; implicit-def: $vgpr1
; GFX90A-NEXT: s_branch .LBB224_7
; GFX90A-NEXT: .LBB224_4:
; GFX90A-NEXT: ; implicit-def: $vgpr1
; GFX90A-NEXT: .LBB224_5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s6, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v2, s6
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f32_e32 v3, v1, v0
; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
; GFX90A-NEXT: .LBB224_6: ; %Flow1
; GFX90A-NEXT: s_cbranch_execnz .LBB224_8
; GFX90A-NEXT: .LBB224_7: ; %atomicrmw.shared
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s4
; GFX90A-NEXT: ds_add_rtn_f32 v1, v1, v0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: .LBB224_8: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v1
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_f32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Float element 10 is the byte offset 40 that appears in the expected output.
; With "^VA" no accvgpr read/write copies appear in the expected output.
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
%data = call float asm "; def $0", "=^VA"()
%result = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
call void asm "; use $0", "^VA"(float %result)
ret void
}
; fsub f32 through an inreg (scalar) flat pointer; input and result use AGPR.
; The expected output on both targets is a flat_atomic_cmpswap retry loop that
; computes the new value with v_sub_f32. The AGPR input is copied to a VGPR
; before the loop and the returned value is written back to a0 inside it.
define void @flat_atomic_fsub_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_f32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB225_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB225_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_f32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB225_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB225_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Float element 10 is the byte offset 40 that appears in the expected output.
; The "=a"/"a" constraints pin the asm def and use to an AGPR (a0 above).
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
%data = call float asm "; def $0", "=a"()
%result = atomicrmw fsub ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
call void asm "; use $0", "a"(float %result)
ret void
}
; fsub f32 through an inreg (scalar) flat pointer; input and result use the
; "^VA" (AV) constraint, which the expected output satisfies with plain VGPRs.
; The expected output on both targets is a flat_atomic_cmpswap retry loop that
; computes the new value with v_sub_f32.
define void @flat_atomic_fsub_f32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_f32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB226_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v3
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB226_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_f32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB226_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_sub_f32_e32 v4, v5, v3
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB226_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Float element 10 is the byte offset 40 that appears in the expected output.
; With "^VA" no accvgpr read/write copies appear in the expected output.
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
%data = call float asm "; def $0", "=^VA"()
%result = atomicrmw fsub ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
call void asm "; use $0", "^VA"(float %result)
ret void
}
; fmax f32 through an inreg (scalar) flat pointer; input and result use AGPR.
; The expected output on both targets is a flat_atomic_cmpswap retry loop.
; Each operand is first passed through v_max_f32 with itself (presumably
; canonicalization - confirm against the lowering) before the v_max_f32 that
; combines them.
define void @flat_atomic_fmax_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_f32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB227_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB227_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_f32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_max_f32_e32 v4, v0, v0
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: .LBB227_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_max_f32_e32 v2, v3, v3
; GFX950-NEXT: v_max_f32_e32 v2, v2, v4
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB227_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Float element 10 is the byte offset 40 that appears in the expected output.
; The "=a"/"a" constraints pin the asm def and use to an AGPR (a0 above).
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
%data = call float asm "; def $0", "=a"()
%result = atomicrmw fmax ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
call void asm "; use $0", "a"(float %result)
ret void
}
; fmax f32 through an inreg (scalar) flat pointer; input and result use the
; "^VA" (AV) constraint, which the expected output satisfies with plain VGPRs.
; The expected output on both targets is a flat_atomic_cmpswap retry loop.
; Each operand is first passed through v_max_f32 with itself (presumably
; canonicalization - confirm against the lowering) before the v_max_f32 that
; combines them.
define void @flat_atomic_fmax_f32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_f32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB228_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v3
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB228_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_f32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_max_f32_e32 v3, v0, v0
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: .LBB228_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_max_f32_e32 v2, v5, v5
; GFX950-NEXT: v_max_f32_e32 v4, v2, v3
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB228_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Float element 10 is the byte offset 40 that appears in the expected output.
; With "^VA" no accvgpr read/write copies appear in the expected output.
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
%data = call float asm "; def $0", "=^VA"()
%result = atomicrmw fmax ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
call void asm "; use $0", "^VA"(float %result)
ret void
}
; fmin f32 through an inreg (scalar) flat pointer; input and result use AGPR.
; The expected output on both targets is a flat_atomic_cmpswap retry loop.
; Each operand is first passed through v_max_f32 with itself (presumably
; canonicalization - confirm against the lowering) before the v_min_f32 that
; combines them.
define void @flat_atomic_fmin_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_f32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB229_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB229_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_f32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_max_f32_e32 v4, v0, v0
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: .LBB229_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_max_f32_e32 v2, v3, v3
; GFX950-NEXT: v_min_f32_e32 v2, v2, v4
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB229_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Float element 10 is the byte offset 40 that appears in the expected output.
; The "=a"/"a" constraints pin the asm def and use to an AGPR (a0 above).
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
%data = call float asm "; def $0", "=a"()
%result = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
call void asm "; use $0", "a"(float %result)
ret void
}
; fmin f32 through an inreg (scalar) flat pointer; input and result use the
; "^VA" (AV) constraint, which the expected output satisfies with plain VGPRs.
; The expected output on both targets is a flat_atomic_cmpswap retry loop.
; Each operand is first passed through v_max_f32 with itself (presumably
; canonicalization - confirm against the lowering) before the v_min_f32 that
; combines them.
define void @flat_atomic_fmin_f32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_f32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB230_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5
; GFX90A-NEXT: v_min_f32_e32 v4, v2, v3
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB230_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_f32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_max_f32_e32 v3, v0, v0
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: .LBB230_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_max_f32_e32 v2, v5, v5
; GFX950-NEXT: v_min_f32_e32 v4, v2, v3
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB230_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Float element 10 is the byte offset 40 that appears in the expected output.
; With "^VA" no accvgpr read/write copies appear in the expected output.
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
%data = call float asm "; def $0", "=^VA"()
%result = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
call void asm "; use $0", "^VA"(float %result)
ret void
}
; fmaximum f32 through an inreg (scalar) flat pointer; input and result use
; AGPR. The expected output on both targets is a flat_atomic_cmpswap retry
; loop. On gfx90a the new value is v_max_f32 of the operands, replaced by
; 0x7fc00000 (an f32 quiet-NaN bit pattern) when v_cmp_o_f32 reports them
; unordered; on gfx950 it is a single v_maximum3_f32.
define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_f32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX90A-NEXT: .LBB231_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB231_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_f32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB231_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, v4
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB231_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Float element 10 is the byte offset 40 that appears in the expected output.
; The "=a"/"a" constraints pin the asm def and use to an AGPR (a0 above).
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
%data = call float asm "; def $0", "=a"()
%result = atomicrmw fmaximum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
call void asm "; use $0", "a"(float %result)
ret void
}
; fmaximum f32 through an inreg (scalar) flat pointer; input and result use
; the "^VA" (AV) constraint, which the expected output satisfies with plain
; VGPRs. The expected output on both targets is a flat_atomic_cmpswap retry
; loop. On gfx90a the new value is v_max_f32 of the operands, replaced by
; 0x7fc00000 (an f32 quiet-NaN bit pattern) when v_cmp_o_f32 reports them
; unordered; on gfx950 it is a single v_maximum3_f32.
define void @flat_atomic_fmaximum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_f32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB232_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2
; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB232_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_f32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB232_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_maximum3_f32 v4, v5, v3, v3
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB232_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Float element 10 is the byte offset 40 that appears in the expected output.
; With "^VA" no accvgpr read/write copies appear in the expected output.
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
%data = call float asm "; def $0", "=^VA"()
%result = atomicrmw fmaximum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
call void asm "; use $0", "^VA"(float %result)
ret void
}
; fminimum f32 through an inreg (scalar) flat pointer; input and result use
; AGPR. The expected output on both targets is a flat_atomic_cmpswap retry
; loop. On gfx90a the new value is v_min_f32 of the operands, replaced by
; 0x7fc00000 (an f32 quiet-NaN bit pattern) when v_cmp_o_f32 reports them
; unordered; on gfx950 it is a single v_minimum3_f32.
define void @flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_f32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX90A-NEXT: .LBB233_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB233_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_f32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB233_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_minimum3_f32 v2, v3, v4, v4
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB233_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Float element 10 is the byte offset 40 that appears in the expected output.
; The "=a"/"a" constraints pin the asm def and use to an AGPR (a0 above).
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
%data = call float asm "; def $0", "=a"()
%result = atomicrmw fminimum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
call void asm "; use $0", "a"(float %result)
ret void
}
; atomicrmw fminimum on a float through an inreg (SGPR-passed) flat pointer,
; workgroup scope, seq_cst. The data operand is produced by inline asm with the
; "=^VA" constraint and the result is consumed by inline asm with "^VA" (the
; "av_av" in the test name; presumably "either VGPR or AGPR" -- confirm against
; the AMDGPU inline asm constraint documentation). In the expected output for
; both run lines the allocator picks VGPRs and the operation is expanded to a
; flat_atomic_cmpswap retry loop: gfx90a builds the value with v_min_f32,
; v_cmp_o_f32 and a v_cndmask against the constant 0x7fc00000, while gfx950
; uses a single v_minimum3_f32.
define void @flat_atomic_fminimum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_f32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB234_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2
; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB234_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_f32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB234_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_minimum3_f32 v4, v5, v3, v3
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB234_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a float array is byte offset 40, the offset:40 on the flat
; instructions in the expected output above.
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
%data = call float asm "; def $0", "=^VA"()
%result = atomicrmw fminimum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
call void asm "; use $0", "^VA"(float %result)
ret void
}
;---------------------------------------------------------------------
; other atomics f64, with aa+av cases using saddr
;---------------------------------------------------------------------
; atomicrmw fadd on a double through an inreg (SGPR-passed) flat pointer,
; workgroup scope, seq_cst. The data operand comes from inline asm with the
; "=a" constraint and the result is consumed with "a", so both live in
; a[0:1] in the expected output and are copied to/from VGPRs with
; v_accvgpr_read_b32 / v_accvgpr_write_b32.
; The expected code dispatches on the address: the upper dword of the pointer
; is compared against src_shared_base and then src_private_base, selecting
; ds_add_rtn_f64 (atomicrmw.shared), a plain load / v_add_f64 / store sequence
; (atomicrmw.private; buffer_* on gfx90a, scratch_* on gfx950), or
; global_atomic_add_f64 (atomicrmw.global).
define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_f64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB235_3
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_vccz .LBB235_4
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB235_5
; GFX90A-NEXT: s_branch .LBB235_6
; GFX90A-NEXT: .LBB235_3:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_branch .LBB235_7
; GFX90A-NEXT: .LBB235_4:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB235_5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s6, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s6
; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB235_6: ; %Flow1
; GFX90A-NEXT: s_cbranch_execnz .LBB235_8
; GFX90A-NEXT: .LBB235_7: ; %atomicrmw.shared
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: .LBB235_8: ; %atomicrmw.end
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_f64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_shared_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB235_3
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: s_cbranch_vccz .LBB235_4
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b32_e32 v2, 0
; GFX950-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB235_5
; GFX950-NEXT: s_branch .LBB235_6
; GFX950-NEXT: .LBB235_3:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_branch .LBB235_7
; GFX950-NEXT: .LBB235_4:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB235_5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s2, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[4:5], s2
; GFX950-NEXT: .LBB235_6: ; %Flow1
; GFX950-NEXT: s_cbranch_execnz .LBB235_8
; GFX950-NEXT: .LBB235_7: ; %atomicrmw.shared
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: v_mov_b32_e32 v2, s0
; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v2, v[0:1]
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: .LBB235_8: ; %atomicrmw.end
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a double array is byte offset 80, the 0x50 added to the
; pointer at the top of the expected output above.
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
%result = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(double %result)
ret void
}
; Same operation as the "_a_a" variant of this test (atomicrmw fadd on a
; double through an inreg flat pointer, workgroup scope, seq_cst), but the
; inline asm producing the data and consuming the result uses the "^VA"
; constraint (presumably "either VGPR or AGPR" -- confirm against the AMDGPU
; inline asm constraint documentation). In the expected output the allocator
; picks VGPRs (def v[0:1], use v[2:3]), so no v_accvgpr copies appear.
; The shared / private / global dispatch on src_shared_base and
; src_private_base is still expected, ending in ds_add_rtn_f64, a load /
; v_add_f64 / store sequence, or global_atomic_add_f64 respectively.
define void @flat_atomic_fadd_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_f64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB236_3
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_vccz .LBB236_4
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[4:5] glc
; GFX90A-NEXT: s_cbranch_execz .LBB236_5
; GFX90A-NEXT: s_branch .LBB236_6
; GFX90A-NEXT: .LBB236_3:
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_branch .LBB236_7
; GFX90A-NEXT: .LBB236_4:
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB236_5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s6, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s6
; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
; GFX90A-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB236_6: ; %Flow1
; GFX90A-NEXT: s_cbranch_execnz .LBB236_8
; GFX90A-NEXT: .LBB236_7: ; %atomicrmw.shared
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: .LBB236_8: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_f64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_shared_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB236_3
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: s_cbranch_vccz .LBB236_4
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b32_e32 v2, 0
; GFX950-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0
; GFX950-NEXT: s_cbranch_execz .LBB236_5
; GFX950-NEXT: s_branch .LBB236_6
; GFX950-NEXT: .LBB236_3:
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_branch .LBB236_7
; GFX950-NEXT: .LBB236_4:
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB236_5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s2, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
; GFX950-NEXT: scratch_store_dwordx2 off, v[4:5], s2
; GFX950-NEXT: .LBB236_6: ; %Flow1
; GFX950-NEXT: s_cbranch_execnz .LBB236_8
; GFX950-NEXT: .LBB236_7: ; %atomicrmw.shared
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v2, s0
; GFX950-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1]
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: .LBB236_8: ; %atomicrmw.end
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a double array is byte offset 80, the 0x50 added to the
; pointer at the top of the expected output above.
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=^VA"()
%result = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(double %result)
ret void
}
; atomicrmw fsub on a double through an inreg (SGPR-passed) flat pointer,
; workgroup scope, seq_cst. The data operand comes from inline asm with the
; "=a" constraint and the result is consumed with "a", so both live in
; a[0:1] in the expected output.
; The expected code only tests the address against src_private_base. The
; atomicrmw.global path is a flat_atomic_cmpswap_x2 retry loop whose new value
; is v_add_f64 with a negated second operand; the atomicrmw.private path is a
; plain load / v_add_f64 (negated operand) / store sequence (buffer_* on
; gfx90a, scratch_* on gfx950). Both paths write the old value back to a[0:1].
define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_f64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_cbranch_vccz .LBB237_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB237_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB237_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB237_6
; GFX90A-NEXT: .LBB237_4:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_cbranch_execz .LBB237_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB237_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_f64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: s_cbranch_vccz .LBB237_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB237_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB237_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB237_6
; GFX950-NEXT: .LBB237_4:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_cbranch_execz .LBB237_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB237_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a double array is byte offset 80, the 0x50 added to the
; pointer at the top of the expected output above.
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
%result = atomicrmw fsub ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(double %result)
ret void
}
; Same operation as the "_a_a" variant of this test (atomicrmw fsub on a
; double through an inreg flat pointer, workgroup scope, seq_cst), but the
; inline asm producing the data and consuming the result uses the "^VA"
; constraint (presumably "either VGPR or AGPR" -- confirm against the AMDGPU
; inline asm constraint documentation). In the expected output the allocator
; picks VGPRs (def v[0:1], use v[2:3]), so no v_accvgpr copies appear.
; The expected code tests the address against src_private_base only: the
; atomicrmw.global path is a flat_atomic_cmpswap_x2 retry loop and the
; atomicrmw.private path is a load / v_add_f64 (negated operand) / store
; sequence.
define void @flat_atomic_fsub_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_f64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB238_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB238_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1]
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB238_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB238_6
; GFX90A-NEXT: .LBB238_4:
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_cbranch_execz .LBB238_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[0:1]
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB238_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_f64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB238_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB238_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3]
; GFX950-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1]
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB238_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB238_6
; GFX950-NEXT: .LBB238_4:
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_cbranch_execz .LBB238_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[0:1]
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB238_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a double array is byte offset 80, the 0x50 added to the
; pointer at the top of the expected output above.
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=^VA"()
%result = atomicrmw fsub ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(double %result)
ret void
}
; atomicrmw fmax on a double through an inreg (SGPR-passed) flat pointer,
; workgroup scope, seq_cst. The data operand comes from inline asm with the
; "=a" constraint and the result is consumed with "a", so both live in
; a[0:1] in the expected output.
; The expected code tests the address against src_private_base only. The
; path labelled atomicrmw.global issues a single flat_atomic_max_f64; the
; atomicrmw.private path loads the old value, applies v_max_f64 to each
; operand with itself and then to the pair, and stores the result (buffer_*
; on gfx90a, scratch_* on gfx950). Both paths write the old value to a[0:1].
define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_f64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB239_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB239_3
; GFX90A-NEXT: s_branch .LBB239_4
; GFX90A-NEXT: .LBB239_2:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB239_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1]
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB239_4: ; %atomicrmw.end
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_f64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB239_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB239_3
; GFX950-NEXT: s_branch .LBB239_4
; GFX950-NEXT: .LBB239_2:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB239_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB239_4: ; %atomicrmw.end
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a double array is byte offset 80, the 0x50 added to the
; pointer at the top of the expected output above.
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
%result = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(double %result)
ret void
}
; Same operation as the "_a_a" variant of this test (atomicrmw fmax on a
; double through an inreg flat pointer, workgroup scope, seq_cst), but the
; inline asm producing the data and consuming the result uses the "^VA"
; constraint (presumably "either VGPR or AGPR" -- confirm against the AMDGPU
; inline asm constraint documentation). In the expected output the allocator
; picks VGPRs, so no v_accvgpr copies appear; note that the two targets end up
; with opposite assignments (gfx90a: def v[2:3] / use v[0:1], gfx950:
; def v[0:1] / use v[2:3]).
; The expected code tests the address against src_private_base only and uses
; flat_atomic_max_f64 on the atomicrmw.global path and a load / v_max_f64 /
; store sequence on the atomicrmw.private path.
define void @flat_atomic_fmax_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_f64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB240_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_execz .LBB240_3
; GFX90A-NEXT: s_branch .LBB240_4
; GFX90A-NEXT: .LBB240_2:
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB240_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB240_4: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_f64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB240_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_cbranch_execz .LBB240_3
; GFX950-NEXT: s_branch .LBB240_4
; GFX950-NEXT: .LBB240_2:
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB240_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX950-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1]
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB240_4: ; %atomicrmw.end
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a double array is byte offset 80, the 0x50 added to the
; pointer at the top of the expected output above.
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=^VA"()
%result = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(double %result)
ret void
}
; atomicrmw fmin on a double through an inreg (SGPR-passed) flat pointer,
; workgroup scope, seq_cst. The data operand comes from inline asm with the
; "=a" constraint and the result is consumed with "a", so both live in
; a[0:1] in the expected output.
; The expected code tests the address against src_private_base only. The
; path labelled atomicrmw.global issues a single flat_atomic_min_f64; the
; atomicrmw.private path loads the old value, applies v_max_f64 to each
; operand with itself, combines them with v_min_f64, and stores the result
; (buffer_* on gfx90a, scratch_* on gfx950). Both paths write the old value
; to a[0:1].
define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_f64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB241_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB241_3
; GFX90A-NEXT: s_branch .LBB241_4
; GFX90A-NEXT: .LBB241_2:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB241_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX90A-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1]
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB241_4: ; %atomicrmw.end
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_f64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB241_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB241_3
; GFX950-NEXT: s_branch .LBB241_4
; GFX950-NEXT: .LBB241_2:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB241_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB241_4: ; %atomicrmw.end
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a double array is byte offset 80, the 0x50 added to the
; pointer at the top of the expected output above.
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
%result = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(double %result)
ret void
}
; f64 atomicrmw fmin (workgroup scope, seq_cst) through a flat pointer that is
; passed inreg; the expected output reads the pointer from SGPRs.
; The data operand and the returned value both go through inline asm using the
; "^VA" constraint; the expected output for both targets shows VGPR pairs being
; chosen, so no v_accvgpr copies appear.
; Expected code compares the high half of the address against the high half of
; src_private_base and then takes one of two paths:
;   %atomicrmw.global  - a single returning flat_atomic_min_f64
;   %atomicrmw.private - non-atomic load, v_min_f64 (after v_max_f64 x, x on
;                        each input), and store
; gfx90a accesses the private path with buffer dword load/store pairs, gfx950
; with scratch dwordx2 load/store.
define void @flat_atomic_fmin_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_f64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB242_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_execz .LBB242_3
; GFX90A-NEXT: s_branch .LBB242_4
; GFX90A-NEXT: .LBB242_2:
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB242_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB242_4: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_f64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB242_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_cbranch_execz .LBB242_3
; GFX950-NEXT: s_branch .LBB242_4
; GFX950-NEXT: .LBB242_2:
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB242_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX950-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1]
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB242_4: ; %atomicrmw.end
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a double array: 10 * 8 = 80 bytes, the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
; Opaque definition of the data operand; printed as "; def ..." in the output.
%data = call double asm "; def $0", "=^VA"()
%result = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Opaque use that keeps the returned (old) value live.
call void asm "; use $0", "^VA"(double %result)
ret void
}
; f64 atomicrmw fmaximum (workgroup scope, seq_cst) through a flat pointer that
; is passed inreg; the expected output reads the pointer from SGPRs.
; The data operand and the returned value both use the "a" (AGPR) inline asm
; constraint, so the expected output copies a[0:1] into VGPRs with
; v_accvgpr_read_b32 and moves the result back with v_accvgpr_write_b32.
; Expected code compares the high half of the address against the high half of
; src_private_base and then takes one of two paths:
;   %atomicrmw.global  - a flat_atomic_cmpswap_x2 retry loop
;   %atomicrmw.private - the same computation once, with a non-atomic load
;                        and store
; The new value in both paths is v_max_f64 of the loaded value and the data
; operand, replaced by high dword 0x7ff80000 / low dword 0 when v_cmp_u_f64
; reports the operands as unordered.
define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_f64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_cbranch_vccz .LBB243_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000
; GFX90A-NEXT: .LBB243_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB243_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB243_6
; GFX90A-NEXT: .LBB243_4:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_cbranch_execz .LBB243_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB243_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_f64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: s_cbranch_vccz .LBB243_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
; GFX950-NEXT: .LBB243_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB243_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB243_6
; GFX950-NEXT: .LBB243_4:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_cbranch_execz .LBB243_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB243_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a double array: 10 * 8 = 80 bytes, the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
; Opaque definition of the data operand in an AGPR pair.
%data = call double asm "; def $0", "=a"()
%result = atomicrmw fmaximum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Opaque use that forces the returned (old) value into an AGPR pair.
call void asm "; use $0", "a"(double %result)
ret void
}
; f64 atomicrmw fmaximum (workgroup scope, seq_cst) through a flat pointer that
; is passed inreg; the expected output reads the pointer from SGPRs.
; The data operand and the returned value both go through inline asm using the
; "^VA" constraint; the expected output for both targets shows VGPR pairs being
; chosen, so no v_accvgpr copies appear.
; Expected code compares the high half of the address against the high half of
; src_private_base and then takes one of two paths:
;   %atomicrmw.global  - a flat_atomic_cmpswap_x2 retry loop
;   %atomicrmw.private - the same computation once, with a non-atomic load
;                        and store
; The new value in both paths is v_max_f64 of the loaded value and the data
; operand, replaced by high dword 0x7ff80000 / low dword 0 when v_cmp_u_f64
; reports the operands as unordered.
define void @flat_atomic_fmaximum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_f64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB244_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX90A-NEXT: .LBB244_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB244_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB244_6
; GFX90A-NEXT: .LBB244_4:
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_cbranch_execz .LBB244_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc
; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB244_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_f64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB244_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX950-NEXT: .LBB244_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3]
; GFX950-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB244_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB244_6
; GFX950-NEXT: .LBB244_4:
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_cbranch_execz .LBB244_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB244_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a double array: 10 * 8 = 80 bytes, the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
; Opaque definition of the data operand; printed as "; def ..." in the output.
%data = call double asm "; def $0", "=^VA"()
%result = atomicrmw fmaximum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Opaque use that keeps the returned (old) value live.
call void asm "; use $0", "^VA"(double %result)
ret void
}
; f64 atomicrmw fminimum (workgroup scope, seq_cst) through a flat pointer that
; is passed inreg; the expected output reads the pointer from SGPRs.
; The data operand and the returned value both use the "a" (AGPR) inline asm
; constraint, so the expected output copies a[0:1] into VGPRs with
; v_accvgpr_read_b32 and moves the result back with v_accvgpr_write_b32.
; Expected code compares the high half of the address against the high half of
; src_private_base and then takes one of two paths:
;   %atomicrmw.global  - a flat_atomic_cmpswap_x2 retry loop
;   %atomicrmw.private - the same computation once, with a non-atomic load
;                        and store
; The new value in both paths is v_min_f64 of the loaded value and the data
; operand, replaced by high dword 0x7ff80000 / low dword 0 when v_cmp_u_f64
; reports the operands as unordered.
define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_f64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_cbranch_vccz .LBB245_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000
; GFX90A-NEXT: .LBB245_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB245_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB245_6
; GFX90A-NEXT: .LBB245_4:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_cbranch_execz .LBB245_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB245_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_f64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: s_cbranch_vccz .LBB245_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
; GFX950-NEXT: .LBB245_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB245_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB245_6
; GFX950-NEXT: .LBB245_4:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_cbranch_execz .LBB245_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB245_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a double array: 10 * 8 = 80 bytes, the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
; Opaque definition of the data operand in an AGPR pair.
%data = call double asm "; def $0", "=a"()
%result = atomicrmw fminimum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Opaque use that forces the returned (old) value into an AGPR pair.
call void asm "; use $0", "a"(double %result)
ret void
}
; f64 atomicrmw fminimum (workgroup scope, seq_cst) through a flat pointer that
; is passed inreg; the expected output reads the pointer from SGPRs.
; The data operand and the returned value both go through inline asm using the
; "^VA" constraint; the expected output for both targets shows VGPR pairs being
; chosen, so no v_accvgpr copies appear.
; Expected code compares the high half of the address against the high half of
; src_private_base and then takes one of two paths:
;   %atomicrmw.global  - a flat_atomic_cmpswap_x2 retry loop
;   %atomicrmw.private - the same computation once, with a non-atomic load
;                        and store
; The new value in both paths is v_min_f64 of the loaded value and the data
; operand, replaced by high dword 0x7ff80000 / low dword 0 when v_cmp_u_f64
; reports the operands as unordered.
define void @flat_atomic_fminimum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_f64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB246_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX90A-NEXT: .LBB246_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB246_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB246_6
; GFX90A-NEXT: .LBB246_4:
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_cbranch_execz .LBB246_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc
; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB246_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_f64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB246_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX950-NEXT: .LBB246_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3]
; GFX950-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB246_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB246_6
; GFX950-NEXT: .LBB246_4:
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_cbranch_execz .LBB246_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB246_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a double array: 10 * 8 = 80 bytes, the 0x50 added to the
; pointer in the expected output.
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
; Opaque definition of the data operand; printed as "; def ..." in the output.
%data = call double asm "; def $0", "=^VA"()
%result = atomicrmw fminimum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Opaque use that keeps the returned (old) value live.
call void asm "; use $0", "^VA"(double %result)
ret void
}
;---------------------------------------------------------------------
; other atomics v2f16, with aa+av cases using saddr
;---------------------------------------------------------------------
; <2 x half> atomicrmw fadd (workgroup scope, seq_cst) through a flat pointer
; that is passed inreg; the expected output reads the pointer from SGPRs.
; The data operand and the returned value both use the "a" (AGPR) inline asm
; constraint, so the expected output moves a0 through a VGPR with
; v_accvgpr_read_b32 / v_accvgpr_write_b32 around the memory operation.
; The two targets are expected to differ:
;   gfx90a - flat_load_dword followed by a flat_atomic_cmpswap retry loop
;            whose new value is computed with v_pk_add_f16
;   gfx950 - a single returning flat_atomic_pk_add_f16
; Unlike the f64 cases in this file, the expected output contains no
; src_private_base comparison or private fallback path.
define void @flat_atomic_fadd_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_v2f16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB247_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB247_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_v2f16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a <2 x half> array: 10 * 4 = 40 bytes, which shows up as the
; offset:40 immediate on the flat instructions in the expected output.
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
; Opaque definition of the data operand in an AGPR.
%data = call <2 x half> asm "; def $0", "=a"()
%result = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Opaque use that forces the returned (old) value into an AGPR.
call void asm "; use $0", "a"(<2 x half> %result)
ret void
}
; <2 x half> atomicrmw fadd (workgroup scope, seq_cst) through a flat pointer
; that is passed inreg; the expected output reads the pointer from SGPRs.
; The data operand and the returned value both go through inline asm using the
; "^VA" constraint; the expected output for both targets shows plain VGPRs
; being chosen, so no v_accvgpr copies appear.
; The two targets are expected to differ:
;   gfx90a - flat_load_dword followed by a flat_atomic_cmpswap retry loop
;            whose new value is computed with v_pk_add_f16
;   gfx950 - a single returning flat_atomic_pk_add_f16
; Unlike the f64 cases in this file, the expected output contains no
; src_private_base comparison or private fallback path.
define void @flat_atomic_fadd_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_v2f16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB248_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v3
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB248_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_v2f16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a <2 x half> array: 10 * 4 = 40 bytes, which shows up as the
; offset:40 immediate on the flat instructions in the expected output.
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
; Opaque definition of the data operand; printed as "; def ..." in the output.
%data = call <2 x half> asm "; def $0", "=^VA"()
%result = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Opaque use that keeps the returned (old) value live.
call void asm "; use $0", "^VA"(<2 x half> %result)
ret void
}
; <2 x half> atomicrmw fsub (workgroup scope, seq_cst) through a flat pointer
; that is passed inreg; the expected output reads the pointer from SGPRs.
; The data operand and the returned value both use the "a" (AGPR) inline asm
; constraint, so the expected output reads a0 into a VGPR before the loop and
; writes the loaded value back to a0 inside it.
; Both targets are expected to emit the same shape: flat_load_dword followed by
; a flat_atomic_cmpswap retry loop. The subtraction is expressed as
; v_pk_add_f16 with neg_lo:[0,1] neg_hi:[0,1], i.e. the second source negated
; in both halves.
; The expected output contains no src_private_base comparison or private
; fallback path.
define void @flat_atomic_fsub_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_v2f16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB249_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1]
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB249_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_v2f16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB249_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1]
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB249_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of a <2 x half> array: 10 * 4 = 40 bytes, which shows up as the
; offset:40 immediate on the flat instructions in the expected output.
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
; Opaque definition of the data operand in an AGPR.
%data = call <2 x half> asm "; def $0", "=a"()
%result = atomicrmw fsub ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
; Opaque use that forces the returned (old) value into an AGPR.
call void asm "; use $0", "a"(<2 x half> %result)
ret void
}
; Same fsub <2 x half> operation as the _a_a variant, but the operand and the
; result use the "^VA" inline-asm constraint. The expected output keeps both
; values in VGPRs (def v3 / use v2), so the cmpswap loop contains no
; v_accvgpr_read/write copies.
define void @flat_atomic_fsub_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_v2f16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB250_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1]
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB250_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_v2f16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB250_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1]
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB250_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
%data = call <2 x half> asm "; def $0", "=^VA"()
%result = atomicrmw fsub ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(<2 x half> %result)
ret void
}
; fmax of <2 x half> at byte offset 40 of an inreg pointer, workgroup scope,
; with the operand and result constrained to "a" (AGPR). Both targets are
; expected to use a flat_atomic_cmpswap loop. The operand read from a0 goes
; through v_pk_max_f16 x, x once before the loop, and the loaded value goes
; through the same self-max each iteration before the real v_pk_max_f16
; (presumably canonicalization -- confirm against the lowering). The gfx950
; output additionally has an s_nop 0 between the two in-loop v_pk_max_f16.
define void @flat_atomic_fmax_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_v2f16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB251_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB251_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_v2f16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_pk_max_f16 v4, v0, v0
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: .LBB251_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_max_f16 v2, v3, v3
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_pk_max_f16 v2, v2, v4
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB251_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
%data = call <2 x half> asm "; def $0", "=a"()
%result = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(<2 x half> %result)
ret void
}
; Same fmax <2 x half> operation as the _a_a variant, but with the "^VA"
; inline-asm constraint on the operand and result. The expected output keeps
; both in VGPRs (def v0 / use v2); because the asm defines v0, the address
; pair v[0:1] is rematerialized from the SGPR pointer before the loop.
define void @flat_atomic_fmax_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_v2f16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v3, v0, v0
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB252_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5
; GFX90A-NEXT: v_pk_max_f16 v4, v2, v3
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB252_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_v2f16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_pk_max_f16 v3, v0, v0
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: .LBB252_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_pk_max_f16 v2, v5, v5
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_pk_max_f16 v4, v2, v3
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB252_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
%data = call <2 x half> asm "; def $0", "=^VA"()
%result = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(<2 x half> %result)
ret void
}
; fmin of <2 x half> at byte offset 40 of an inreg pointer, workgroup scope,
; with the operand and result constrained to "a" (AGPR). The expected output
; mirrors the fmax case: a flat_atomic_cmpswap loop in which both inputs first
; go through v_pk_max_f16 x, x and the new value is then formed by
; v_pk_min_f16. a0 is read before the loop and rewritten each iteration.
define void @flat_atomic_fmin_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_v2f16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB253_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB253_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_v2f16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_pk_max_f16 v4, v0, v0
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: .LBB253_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_max_f16 v2, v3, v3
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_pk_min_f16 v2, v2, v4
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB253_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
%data = call <2 x half> asm "; def $0", "=a"()
%result = atomicrmw fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(<2 x half> %result)
ret void
}
; Same fmin <2 x half> operation as the _a_a variant, but with the "^VA"
; inline-asm constraint on the operand and result. The expected output keeps
; both in VGPRs (def v0 / use v2) and contains no AGPR copies; v[0:1] is
; reloaded from the SGPR pointer after the asm def of v0.
define void @flat_atomic_fmin_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_v2f16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v3, v0, v0
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB254_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5
; GFX90A-NEXT: v_pk_min_f16 v4, v2, v3
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB254_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_v2f16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_pk_max_f16 v3, v0, v0
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: .LBB254_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_pk_max_f16 v2, v5, v5
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_pk_min_f16 v4, v2, v3
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB254_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
%data = call <2 x half> asm "; def $0", "=^VA"()
%result = atomicrmw fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(<2 x half> %result)
ret void
}
; fmaximum of <2 x half> at byte offset 40 of an inreg pointer, workgroup
; scope, with the operand and result constrained to "a" (AGPR). Both targets
; are expected to use a flat_atomic_cmpswap loop, but the new value is
; computed differently. The gfx90a output uses v_pk_max_f16, then per-half
; v_cmp_o_f16 + v_cndmask that select the constant 0x7e00 when the ordered
; compare fails, then v_perm_b32 to repack the halves. The gfx950 output uses
; a single v_pk_maximum3_f16 with the operand repeated.
define void @flat_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_v2f16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB255_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_max_f16 v0, v1, v4
; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v1, v4 src0_sel:WORD_1 src1_sel:WORD_1
; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v0, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_sdwa v0, v5, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_perm_b32 v0, v0, v6, s8
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB255_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_v2f16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB255_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_maximum3_f16 v2, v3, v4, v4
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB255_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
%data = call <2 x half> asm "; def $0", "=a"()
%result = atomicrmw fmaximum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(<2 x half> %result)
ret void
}
; Same fmaximum <2 x half> operation as the _a_a variant, but with the "^VA"
; inline-asm constraint on the operand and result. The expected output keeps
; both in VGPRs with no AGPR copies. The gfx90a loop still uses the
; v_pk_max_f16 / v_cmp_o_f16 / v_cndmask / v_perm_b32 sequence, and the gfx950
; loop still uses a single v_pk_maximum3_f16.
define void @flat_atomic_fmaximum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_v2f16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB256_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2
; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1
; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB256_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_v2f16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB256_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_pk_maximum3_f16 v4, v5, v3, v3
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB256_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
%data = call <2 x half> asm "; def $0", "=^VA"()
%result = atomicrmw fmaximum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(<2 x half> %result)
ret void
}
; fminimum of <2 x half> at byte offset 40 of an inreg pointer, workgroup
; scope, with the operand and result constrained to "a" (AGPR). The expected
; output mirrors the fmaximum case with min in place of max. The gfx90a loop
; uses v_pk_min_f16 plus per-half v_cmp_o_f16 + v_cndmask selecting 0x7e00
; when the ordered compare fails, then v_perm_b32. The gfx950 loop uses a
; single v_pk_minimum3_f16.
define void @flat_atomic_fminimum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_v2f16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB257_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_min_f16 v0, v1, v4
; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v1, v4 src0_sel:WORD_1 src1_sel:WORD_1
; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v0, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_sdwa v0, v5, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_perm_b32 v0, v0, v6, s8
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB257_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_v2f16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB257_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_minimum3_f16 v2, v3, v4, v4
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB257_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
%data = call <2 x half> asm "; def $0", "=a"()
%result = atomicrmw fminimum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(<2 x half> %result)
ret void
}
; Same fminimum <2 x half> operation as the _a_a variant, but with the "^VA"
; inline-asm constraint on the operand and result. The expected output keeps
; both in VGPRs with no AGPR copies; the per-target computation of the new
; value inside the cmpswap loop is otherwise the same as in the _a_a variant.
define void @flat_atomic_fminimum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_v2f16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB258_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2
; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1
; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB258_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_v2f16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB258_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v2
; GFX950-NEXT: v_pk_minimum3_f16 v4, v5, v3, v3
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB258_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
%data = call <2 x half> asm "; def $0", "=^VA"()
%result = atomicrmw fminimum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(<2 x half> %result)
ret void
}
;---------------------------------------------------------------------
; other atomics v2bf16, with aa+av cases using saddr
;---------------------------------------------------------------------
; fadd of <2 x bfloat> at byte offset 40 of an inreg pointer, workgroup scope,
; with the operand and result constrained to "a" (AGPR). The two targets are
; expected to differ. The gfx90a output is a flat_atomic_cmpswap loop that
; widens each bf16 half to f32 (shift left by 16 / mask 0xffff0000), adds with
; v_add_f32, and narrows back with a v_bfe_u32 / v_add3_u32 (0x7fff) /
; v_or_b32 (0x400000) / v_cmp_u_f32 / v_cndmask sequence followed by
; v_perm_b32. The gfx950 output is a single flat_atomic_pk_add_bf16 with no
; loop, with a0 copied through a VGPR on the way in and out.
define void @flat_atomic_fadd_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_v2bf16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB259_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GFX90A-NEXT: v_add_f32_e32 v0, v0, v4
; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB259_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_v2bf16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
%data = call <2 x bfloat> asm "; def $0", "=a"()
%result = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(<2 x bfloat> %result)
ret void
}
; Same fadd <2 x bfloat> operation as the _a_a variant, but with the "^VA"
; inline-asm constraint on the operand and result. The expected output keeps
; both in VGPRs with no AGPR copies. The gfx90a output is still the f32-based
; cmpswap loop; the gfx950 output feeds the asm-defined VGPR directly into
; flat_atomic_pk_add_bf16 and uses its returned VGPR in the asm use.
define void @flat_atomic_fadd_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_v2bf16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB260_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2
; GFX90A-NEXT: v_add_f32_e32 v6, v6, v3
; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB260_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_v2bf16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
%data = call <2 x bfloat> asm "; def $0", "=^VA"()
%result = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(<2 x bfloat> %result)
ret void
}
; fsub of <2 x bfloat> through a flat pointer passed in SGPRs (inreg), with the
; data operand defined into, and the result consumed from, an AGPR ("a").
; Both targets' checks show a flat_load_dword plus flat_atomic_cmpswap retry
; loop doing the subtract in f32. The AGPR input is copied to a VGPR with
; v_accvgpr_read_b32 before the loop, and the loaded-back value is copied to a0
; with v_accvgpr_write_b32 inside the loop. For repacking to bf16, gfx950 uses
; v_cvt_pk_bf16_f32 where gfx90a uses a longer bfe/add3/cndmask/perm sequence.
define void @flat_atomic_fsub_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_v2bf16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB261_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v4
; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB261_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_v2bf16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB261_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX950-NEXT: v_sub_f32_e32 v0, v0, v4
; GFX950-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB261_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an array of 4-byte <2 x bfloat> values, i.e. byte offset 40;
; this is the offset:40 that appears on the memory instructions in the checks.
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
; "a" pins the value to an AGPR; the checks show a0 for both def and use.
%data = call <2 x bfloat> asm "; def $0", "=a"()
%result = atomicrmw fsub ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(<2 x bfloat> %result)
ret void
}
; fsub of <2 x bfloat> through a flat pointer passed in SGPRs (inreg), with the
; data operand and the result both tied to inline asm via the "^VA" constraint.
; Both targets' checks show a flat_load_dword plus flat_atomic_cmpswap retry
; loop doing the subtract in f32, with no AGPR copies. For repacking to bf16,
; gfx950 uses v_cvt_pk_bf16_f32 where gfx90a uses a longer
; bfe/add3/cndmask/perm sequence.
define void @flat_atomic_fsub_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_v2bf16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB262_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v3
; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB262_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_v2bf16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: .LBB262_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v4
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX950-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX950-NEXT: v_sub_f32_e32 v6, v6, v3
; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB262_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an array of 4-byte <2 x bfloat> values, i.e. byte offset 40;
; this is the offset:40 that appears on the memory instructions in the checks.
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
; NOTE(review): "^VA" is presumably the AMDGPU VGPR-or-AGPR constraint; confirm
; against the AMDGPU usage guide. The checks show a VGPR used for def and use.
%data = call <2 x bfloat> asm "; def $0", "=^VA"()
%result = atomicrmw fsub ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(<2 x bfloat> %result)
ret void
}
; fmax of <2 x bfloat> through a flat pointer passed in SGPRs (inreg), with the
; data operand defined into, and the result consumed from, an AGPR ("a").
; Both targets' checks show a flat_load_dword plus flat_atomic_cmpswap retry
; loop using v_max_f32 on the two halves widened to f32. The AGPR input is
; copied to a VGPR with v_accvgpr_read_b32 before the loop, and the loaded-back
; value is copied to a0 with v_accvgpr_write_b32 inside the loop. For repacking
; to bf16, gfx950 uses v_cvt_pk_bf16_f32 where gfx90a uses a longer
; bfe/add3/cndmask/perm sequence.
define void @flat_atomic_fmax_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_v2bf16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB263_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4
; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB263_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_v2bf16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB263_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX950-NEXT: v_max_f32_e32 v0, v0, v4
; GFX950-NEXT: v_max_f32_e32 v6, v6, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB263_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an array of 4-byte <2 x bfloat> values, i.e. byte offset 40;
; this is the offset:40 that appears on the memory instructions in the checks.
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
; "a" pins the value to an AGPR; the checks show a0 for both def and use.
%data = call <2 x bfloat> asm "; def $0", "=a"()
%result = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(<2 x bfloat> %result)
ret void
}
; fmax of <2 x bfloat> through a flat pointer passed in SGPRs (inreg), with the
; data operand and the result both tied to inline asm via the "^VA" constraint.
; Both targets' checks show a flat_load_dword plus flat_atomic_cmpswap retry
; loop using v_max_f32 on the two halves widened to f32, with no AGPR copies.
; For repacking to bf16, gfx950 uses v_cvt_pk_bf16_f32 where gfx90a uses a
; longer bfe/add3/cndmask/perm sequence.
define void @flat_atomic_fmax_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_v2bf16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB264_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2
; GFX90A-NEXT: v_max_f32_e32 v6, v6, v3
; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB264_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_v2bf16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: .LBB264_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v4
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX950-NEXT: v_max_f32_e32 v4, v4, v2
; GFX950-NEXT: v_max_f32_e32 v6, v6, v3
; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB264_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an array of 4-byte <2 x bfloat> values, i.e. byte offset 40;
; this is the offset:40 that appears on the memory instructions in the checks.
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
; NOTE(review): "^VA" is presumably the AMDGPU VGPR-or-AGPR constraint; confirm
; against the AMDGPU usage guide. The checks show a VGPR used for def and use.
%data = call <2 x bfloat> asm "; def $0", "=^VA"()
%result = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(<2 x bfloat> %result)
ret void
}
; fmin of <2 x bfloat> through a flat pointer passed in SGPRs (inreg), with the
; data operand defined into, and the result consumed from, an AGPR ("a").
; Both targets' checks show a flat_load_dword plus flat_atomic_cmpswap retry
; loop using v_min_f32 on the two halves widened to f32. The AGPR input is
; copied to a VGPR with v_accvgpr_read_b32 before the loop, and the loaded-back
; value is copied to a0 with v_accvgpr_write_b32 inside the loop. For repacking
; to bf16, gfx950 uses v_cvt_pk_bf16_f32 where gfx90a uses a longer
; bfe/add3/cndmask/perm sequence.
define void @flat_atomic_fmin_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_v2bf16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB265_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4
; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB265_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_v2bf16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB265_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX950-NEXT: v_min_f32_e32 v0, v0, v4
; GFX950-NEXT: v_min_f32_e32 v6, v6, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB265_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an array of 4-byte <2 x bfloat> values, i.e. byte offset 40;
; this is the offset:40 that appears on the memory instructions in the checks.
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
; "a" pins the value to an AGPR; the checks show a0 for both def and use.
%data = call <2 x bfloat> asm "; def $0", "=a"()
%result = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(<2 x bfloat> %result)
ret void
}
; fmin of <2 x bfloat> through a flat pointer passed in SGPRs (inreg), with the
; data operand and the result both tied to inline asm via the "^VA" constraint.
; Both targets' checks show a flat_load_dword plus flat_atomic_cmpswap retry
; loop using v_min_f32 on the two halves widened to f32, with no AGPR copies.
; For repacking to bf16, gfx950 uses v_cvt_pk_bf16_f32 where gfx90a uses a
; longer bfe/add3/cndmask/perm sequence.
define void @flat_atomic_fmin_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_v2bf16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB266_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2
; GFX90A-NEXT: v_min_f32_e32 v6, v6, v3
; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB266_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_v2bf16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: .LBB266_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v4
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX950-NEXT: v_min_f32_e32 v4, v4, v2
; GFX950-NEXT: v_min_f32_e32 v6, v6, v3
; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB266_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an array of 4-byte <2 x bfloat> values, i.e. byte offset 40;
; this is the offset:40 that appears on the memory instructions in the checks.
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
; NOTE(review): "^VA" is presumably the AMDGPU VGPR-or-AGPR constraint; confirm
; against the AMDGPU usage guide. The checks show a VGPR used for def and use.
%data = call <2 x bfloat> asm "; def $0", "=^VA"()
%result = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(<2 x bfloat> %result)
ret void
}
; fmaximum of <2 x bfloat> through a flat pointer passed in SGPRs (inreg), with
; the data operand defined into, and the result consumed from, an AGPR ("a").
; Both targets' checks show a flat_load_dword plus flat_atomic_cmpswap retry
; loop operating on the two halves widened to f32. In the gfx90a checks each
; v_max_f32 result is kept only when v_cmp_o_f32 reports the inputs ordered;
; otherwise v_cndmask_b32 substitutes the constant 0x7fc00000. The gfx950
; checks use v_maximum3_f32 with the second source repeated, followed by
; v_cvt_pk_bf16_f32. The AGPR input is copied to a VGPR with v_accvgpr_read_b32
; before the loop, and the loaded-back value is copied to a0 with
; v_accvgpr_write_b32 inside the loop.
define void @flat_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_v2bf16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB267_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
; GFX90A-NEXT: v_max_f32_e32 v8, v0, v4
; GFX90A-NEXT: v_max_f32_e32 v9, v7, v6
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6
; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v0, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc
; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8
; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v8, v9, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
; GFX90A-NEXT: v_perm_b32 v0, v7, v0, s9
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB267_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_v2bf16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB267_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX950-NEXT: v_maximum3_f32 v0, v0, v4, v4
; GFX950-NEXT: v_maximum3_f32 v6, v6, v5, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB267_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an array of 4-byte <2 x bfloat> values, i.e. byte offset 40;
; this is the offset:40 that appears on the memory instructions in the checks.
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
; "a" pins the value to an AGPR; the checks show a0 for both def and use.
%data = call <2 x bfloat> asm "; def $0", "=a"()
%result = atomicrmw fmaximum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(<2 x bfloat> %result)
ret void
}
; fmaximum of <2 x bfloat> through a flat pointer passed in SGPRs (inreg), with
; the data operand and the result both tied to inline asm via the "^VA"
; constraint. Both targets' checks show a flat_load_dword plus
; flat_atomic_cmpswap retry loop operating on the two halves widened to f32,
; with no AGPR copies. In the gfx90a checks each v_max_f32 result is kept only
; when v_cmp_o_f32 reports the inputs ordered; otherwise v_cndmask_b32
; substitutes the constant 0x7fc00000. The gfx950 checks use v_maximum3_f32
; with the second source repeated, followed by v_cvt_pk_bf16_f32.
define void @flat_atomic_fmaximum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_v2bf16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB268_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
; GFX90A-NEXT: v_max_f32_e32 v8, v5, v2
; GFX90A-NEXT: v_max_f32_e32 v9, v6, v4
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4
; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9
; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB268_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v5
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_v2bf16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: .LBB268_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v4
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX950-NEXT: v_maximum3_f32 v4, v4, v2, v2
; GFX950-NEXT: v_maximum3_f32 v6, v6, v3, v3
; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB268_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
; Element 10 of an array of 4-byte <2 x bfloat> values, i.e. byte offset 40;
; this is the offset:40 that appears on the memory instructions in the checks.
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
; NOTE(review): "^VA" is presumably the AMDGPU VGPR-or-AGPR constraint; confirm
; against the AMDGPU usage guide. The checks show a VGPR used for def and use.
%data = call <2 x bfloat> asm "; def $0", "=^VA"()
%result = atomicrmw fmaximum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(<2 x bfloat> %result)
ret void
}
; atomicrmw fminimum of <2 x bfloat> at element 10 of an inreg pointer
; (offset:40 in the checks), syncscope("workgroup") seq_cst.
; Input and result use AGPR ("a" inline-asm constraint): the checks copy a0
; into a VGPR before the loop and write the result back to a0.
; Both targets are checked to emit a flat_atomic_cmpswap retry loop
; (%atomicrmw.start); GFX950 uses v_minimum3_f32 + v_cvt_pk_bf16_f32, while
; GFX90A uses v_min_f32 with explicit compare/select and bf16 packing code.
define void @flat_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_v2bf16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB269_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
; GFX90A-NEXT: v_min_f32_e32 v8, v0, v4
; GFX90A-NEXT: v_min_f32_e32 v9, v7, v6
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6
; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v0, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc
; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8
; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v8, v9, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
; GFX90A-NEXT: v_perm_b32 v0, v7, v0, s9
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB269_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_v2bf16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB269_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX950-NEXT: v_minimum3_f32 v0, v0, v4, v4
; GFX950-NEXT: v_minimum3_f32 v6, v6, v5, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB269_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
%data = call <2 x bfloat> asm "; def $0", "=a"()
%result = atomicrmw fminimum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "a"(<2 x bfloat> %result)
ret void
}
; atomicrmw fminimum of <2 x bfloat> at element 10 of an inreg pointer
; (offset:40 in the checks), syncscope("workgroup") seq_cst.
; Input and result use the "^VA" inline-asm constraint (the "av" in the test
; name). In the checks below both end up in VGPRs (def v0; use v5 on GFX90A,
; use v4 on GFX950), so no v_accvgpr_read/write copies appear.
; NOTE(review): "VA" is presumably the combined VGPR-or-AGPR class - confirm
; against the AMDGPU inline-asm constraint documentation.
; Both targets are checked to emit a flat_atomic_cmpswap retry loop
; (%atomicrmw.start); GFX950 uses v_minimum3_f32 + v_cvt_pk_bf16_f32, while
; GFX90A uses v_min_f32 with explicit compare/select and bf16 packing code.
define void @flat_atomic_fminimum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_v2bf16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB270_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
; GFX90A-NEXT: v_min_f32_e32 v8, v5, v2
; GFX90A-NEXT: v_min_f32_e32 v9, v6, v4
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4
; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9
; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB270_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v5
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_v2bf16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: .LBB270_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v5, v4
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX950-NEXT: v_minimum3_f32 v4, v4, v2, v2
; GFX950-NEXT: v_minimum3_f32 v6, v6, v3, v3
; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB270_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
%data = call <2 x bfloat> asm "; def $0", "=^VA"()
%result = atomicrmw fminimum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
call void asm "; use $0", "^VA"(<2 x bfloat> %result)
ret void
}
; Attribute group referenced as #0 by the test functions: nounwind plus an
; "amdgpu-waves-per-eu" range whose minimum and maximum are both 10.
; NOTE(review): presumably chosen to fix the per-wave register budget so the
; checked register assignments stay stable - confirm with the test author.
attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" }
; Empty metadata node; used as the operand of !amdgpu.no.fine.grained.memory
; on the atomicrmw instructions.
!0 = !{}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}