| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s |
| ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10 %s |
| ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s |
| |
| ; Test using saddr addressing mode of global_* flat atomic instructions. |
| |
| ; -------------------------------------------------------------------------------- |
| ; atomicrmw max |
| ; -------------------------------------------------------------------------------- |
| |
| ; Returning i32 signed-max atomic: seq_cst atomicrmw max on the global (addrspace 1) |
| ; address %sbase + zext(%voffset); the atomicrmw result is returned bitcast to float. |
| ; Per the checks, every tested target expands this into a compare-and-swap loop: |
| ; the initial load uses the saddr form (v0 offset, s[2:3] base), while the looped |
| ; global_atomic_cmpswap addresses memory through the VGPR pair v[2:3] with "off". |
| define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { |
| ; GFX9-LABEL: global_max_saddr_i32_rtn: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB0_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_max_saddr_i32_rtn: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] |
| ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_max_saddr_i32_rtn: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] |
| ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB0_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %rtn = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst |
| %cast.rtn = bitcast i32 %rtn to float |
| ret float %cast.rtn |
| } |
| |
| ; Returning i32 signed-max atomic on the global address %sbase + zext(%voffset) - 128 |
| ; bytes; the atomicrmw result is returned bitcast to float. |
| ; Per the checks, the constant -128 appears as the immediate "offset:-128" on both |
| ; the initial saddr load and the global_atomic_cmpswap inside the expanded |
| ; compare-and-swap loop, rather than being added into the VGPR address. |
| define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { |
| ; GFX9-LABEL: global_max_saddr_i32_rtn_neg128: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB1_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_max_saddr_i32_rtn_neg128: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 |
| ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB1_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_max_saddr_i32_rtn_neg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB1_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %rtn = atomicrmw max ptr addrspace(1) %gep1, i32 %data seq_cst |
| %cast.rtn = bitcast i32 %rtn to float |
| ret float %cast.rtn |
| } |
| |
| ; Non-returning i32 signed-max atomic on the global address %sbase + zext(%voffset): |
| ; the atomicrmw result (%unused) is discarded and the function returns void. |
| ; Per the checks, the expansion is still a compare-and-swap loop seeded by a saddr |
| ; load; each iteration copies the value returned by global_atomic_cmpswap (v0) into |
| ; v5 as the next expected value, and the shader ends with s_endpgm. |
| define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { |
| ; GFX9-LABEL: global_max_saddr_i32_nortn: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dword v5, v0, s[2:3] |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB2_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB2_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: global_max_saddr_i32_nortn: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dword v5, v0, s[2:3] |
| ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB2_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: global_max_saddr_i32_nortn: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] |
| ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX11-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB2_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_endpgm |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %unused = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst |
| ret void |
| } |
| |
| ; Non-returning i32 signed-max atomic on the global address |
| ; %sbase + zext(%voffset) - 128 bytes; the atomicrmw result (%unused) is discarded. |
| ; Per the checks, the constant -128 appears as the immediate "offset:-128" on both |
| ; the initial saddr load and the global_atomic_cmpswap in the expanded |
| ; compare-and-swap loop, and the shader ends with s_endpgm. |
| define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { |
| ; GFX9-LABEL: global_max_saddr_i32_nortn_neg128: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dword v5, v0, s[2:3] offset:-128 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB3_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB3_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: global_max_saddr_i32_nortn_neg128: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dword v5, v0, s[2:3] offset:-128 |
| ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: global_max_saddr_i32_nortn_neg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX11-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB3_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_endpgm |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %unused = atomicrmw max ptr addrspace(1) %gep1, i32 %data seq_cst |
| ret void |
| } |
| |
| ; Returning i64 signed-max atomic on the global address %sbase + zext(%voffset); the |
| ; atomicrmw result is returned bitcast to <2 x float>. |
| ; Per the checks, the 64-bit max is computed with v_cmp_gt_i64 plus two |
| ; v_cndmask_b32 selects, and the compare-and-swap loop uses the 64-bit cmpswap |
| ; (global_atomic_cmpswap_x2, spelled global_atomic_cmpswap_b64 on gfx1100). |
| ; The initial 64-bit load uses the saddr form; the cmpswap uses v[5:6] with "off". |
| define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { |
| ; GFX9-LABEL: global_max_saddr_i64_rtn: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] |
| ; GFX9-NEXT: v_mov_b32_e32 v6, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v10, v4 |
| ; GFX9-NEXT: v_mov_b32_e32 v9, v3 |
| ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc |
| ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB4_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: v_mov_b32_e32 v0, v3 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, v4 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_max_saddr_i64_rtn: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] |
| ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v10, v4 |
| ; GFX10-NEXT: v_mov_b32_e32 v9, v3 |
| ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] |
| ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB4_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: v_mov_b32_e32 v0, v3 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, v4 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_max_saddr_i64_rtn: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] |
| ; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v10, v4 |
| ; GFX11-NEXT: v_mov_b32_e32 v9, v3 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] |
| ; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB4_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: v_mov_b32_e32 v0, v3 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, v4 |
| ; GFX11-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %rtn = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst |
| %cast.rtn = bitcast i64 %rtn to <2 x float> |
| ret <2 x float> %cast.rtn |
| } |
| |
| ; Returning i64 signed-max atomic on the global address |
| ; %sbase + zext(%voffset) - 128 bytes; the atomicrmw result is returned bitcast to |
| ; <2 x float>. |
| ; Per the checks, the constant -128 appears as the immediate "offset:-128" on both |
| ; the initial 64-bit saddr load and the 64-bit cmpswap in the expanded |
| ; compare-and-swap loop; the max itself is v_cmp_gt_i64 plus two v_cndmask_b32. |
| define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { |
| ; GFX9-LABEL: global_max_saddr_i64_rtn_neg128: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128 |
| ; GFX9-NEXT: v_mov_b32_e32 v6, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v10, v4 |
| ; GFX9-NEXT: v_mov_b32_e32 v9, v3 |
| ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc |
| ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB5_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: v_mov_b32_e32 v0, v3 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, v4 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_max_saddr_i64_rtn_neg128: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128 |
| ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v10, v4 |
| ; GFX10-NEXT: v_mov_b32_e32 v9, v3 |
| ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] |
| ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB5_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: v_mov_b32_e32 v0, v3 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, v4 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_max_saddr_i64_rtn_neg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v10, v4 |
| ; GFX11-NEXT: v_mov_b32_e32 v9, v3 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] |
| ; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB5_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: v_mov_b32_e32 v0, v3 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, v4 |
| ; GFX11-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %rtn = atomicrmw max ptr addrspace(1) %gep1, i64 %data seq_cst |
| %cast.rtn = bitcast i64 %rtn to <2 x float> |
| ret <2 x float> %cast.rtn |
| } |
| |
| ; Non-returning i64 signed-max atomic on the global address %sbase + zext(%voffset): |
| ; the atomicrmw result (%unused) is discarded and the function returns void. |
| ; Per the checks, the expansion is a compare-and-swap loop seeded by a 64-bit saddr |
| ; load; the max is v_cmp_gt_i64 plus two v_cndmask_b32, and each iteration copies |
| ; the pair returned by the 64-bit cmpswap (v[3:4]) into v[5:6] as the next expected |
| ; value before the shader ends with s_endpgm. |
| define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { |
| ; GFX9-LABEL: global_max_saddr_i64_nortn: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc |
| ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] |
| ; GFX9-NEXT: v_mov_b32_e32 v6, v4 |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB6_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: global_max_saddr_i64_nortn: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] |
| ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] |
| ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] |
| ; GFX10-NEXT: v_mov_b32_e32 v6, v4 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB6_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: global_max_saddr_i64_nortn: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] |
| ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] |
| ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] |
| ; GFX11-NEXT: v_mov_b32_e32 v6, v4 |
| ; GFX11-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB6_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_endpgm |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %unused = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst |
| ret void |
| } |
| |
| ; Non-returning i64 signed-max atomic on the global address |
| ; %sbase + zext(%voffset) - 128 bytes; the atomicrmw result (%unused) is discarded. |
| ; Per the checks, the constant -128 appears as the immediate "offset:-128" on both |
| ; the initial 64-bit saddr load and the 64-bit cmpswap in the expanded |
| ; compare-and-swap loop; the max is v_cmp_gt_i64 plus two v_cndmask_b32, and the |
| ; shader ends with s_endpgm. |
| define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { |
| ; GFX9-LABEL: global_max_saddr_i64_nortn_neg128: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc |
| ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] |
| ; GFX9-NEXT: v_mov_b32_e32 v6, v4 |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB7_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: global_max_saddr_i64_nortn_neg128: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128 |
| ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] |
| ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] |
| ; GFX10-NEXT: v_mov_b32_e32 v6, v4 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB7_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: global_max_saddr_i64_nortn_neg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] |
| ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] |
| ; GFX11-NEXT: v_mov_b32_e32 v6, v4 |
| ; GFX11-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB7_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_endpgm |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %unused = atomicrmw max ptr addrspace(1) %gep1, i64 %data seq_cst |
| ret void |
| } |
| |
| ; -------------------------------------------------------------------------------- |
| ; atomicrmw min |
| ; -------------------------------------------------------------------------------- |
| |
| ; Returning signed-min atomicrmw (seq_cst) on the i32 at %sbase + zext(%voffset); |
| ; the previous memory value is bitcast to float and returned. |
| ; In the expected output for all three targets, only the initial load uses the |
| ; saddr form (v0, s[2:3]); the min itself is a v_min_i32 + global_atomic_cmpswap |
| ; retry loop that addresses memory through the VGPR pair v[2:3] with "off". |
| define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { |
| ; GFX9-LABEL: global_min_saddr_i32_rtn: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB8_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_min_saddr_i32_rtn: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] |
| ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_min_saddr_i32_rtn: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] |
| ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB8_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Address = inreg (scalar) base pointer + zero-extended 32-bit offset. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %rtn = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst |
| ; Return the pre-operation value reinterpreted as float. |
| %cast.rtn = bitcast i32 %rtn to float |
| ret float %cast.rtn |
| } |
| |
| ; Same as the returning i32 signed-min case, but the address carries an extra |
| ; constant byte offset of -128 on top of %sbase + zext(%voffset). |
| ; In the expected output the constant shows up as "offset:-128" on both the |
| ; initial saddr-form load and the global_atomic_cmpswap inside the retry loop. |
| define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { |
| ; GFX9-LABEL: global_min_saddr_i32_rtn_neg128: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB9_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_min_saddr_i32_rtn_neg128: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 |
| ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB9_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_min_saddr_i32_rtn_neg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB9_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Address = inreg (scalar) base pointer + zero-extended 32-bit offset - 128 bytes. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %rtn = atomicrmw min ptr addrspace(1) %gep1, i32 %data seq_cst |
| ; Return the pre-operation value reinterpreted as float. |
| %cast.rtn = bitcast i32 %rtn to float |
| ret float %cast.rtn |
| } |
| |
| ; Non-returning signed-min atomicrmw (seq_cst) on the i32 at |
| ; %sbase + zext(%voffset); the function returns void and the atomicrmw result |
| ; is not used. |
| ; The expected output still uses a glc global_atomic_cmpswap retry loop (the |
| ; value it returns feeds the next iteration's compare) and ends in s_endpgm. |
| define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { |
| ; GFX9-LABEL: global_min_saddr_i32_nortn: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dword v5, v0, s[2:3] |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB10_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: global_min_saddr_i32_nortn: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dword v5, v0, s[2:3] |
| ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: global_min_saddr_i32_nortn: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] |
| ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX11-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB10_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_endpgm |
| ; Address = inreg (scalar) base pointer + zero-extended 32-bit offset. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| ; The old value is deliberately left unused (no-return variant). |
| %unused = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst |
| ret void |
| } |
| |
| ; Non-returning i32 signed-min atomicrmw (seq_cst) at |
| ; %sbase + zext(%voffset) - 128; the atomicrmw result is not used. |
| ; In the expected output the constant appears as "offset:-128" on both the |
| ; initial saddr-form load and the global_atomic_cmpswap in the retry loop, and |
| ; the shader ends in s_endpgm. |
| define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { |
| ; GFX9-LABEL: global_min_saddr_i32_nortn_neg128: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dword v5, v0, s[2:3] offset:-128 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB11_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: global_min_saddr_i32_nortn_neg128: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dword v5, v0, s[2:3] offset:-128 |
| ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: global_min_saddr_i32_nortn_neg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX11-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB11_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_endpgm |
| ; Address = inreg (scalar) base pointer + zero-extended 32-bit offset - 128 bytes. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| ; The old value is deliberately left unused (no-return variant). |
| %unused = atomicrmw min ptr addrspace(1) %gep1, i32 %data seq_cst |
| ret void |
| } |
| |
| ; Returning signed-min atomicrmw (seq_cst) on the i64 at %sbase + zext(%voffset); |
| ; the previous memory value is bitcast to <2 x float> and returned. |
| ; In the expected output the 64-bit min is formed with v_cmp_le_i64 plus two |
| ; v_cndmask_b32 selects, then committed by a 64-bit global_atomic_cmpswap |
| ; (_x2 / _b64 spelling depending on target) retry loop addressed through the |
| ; VGPR pair v[5:6] with "off"; only the initial load uses the saddr form. |
| ; After the loop the result is copied from v[3:4] into v0/v1. |
| define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { |
| ; GFX9-LABEL: global_min_saddr_i64_rtn: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] |
| ; GFX9-NEXT: v_mov_b32_e32 v6, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v10, v4 |
| ; GFX9-NEXT: v_mov_b32_e32 v9, v3 |
| ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc |
| ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB12_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: v_mov_b32_e32 v0, v3 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, v4 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_min_saddr_i64_rtn: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] |
| ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v10, v4 |
| ; GFX10-NEXT: v_mov_b32_e32 v9, v3 |
| ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] |
| ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: v_mov_b32_e32 v0, v3 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, v4 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_min_saddr_i64_rtn: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] |
| ; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v10, v4 |
| ; GFX11-NEXT: v_mov_b32_e32 v9, v3 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] |
| ; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB12_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: v_mov_b32_e32 v0, v3 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, v4 |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Address = inreg (scalar) base pointer + zero-extended 32-bit offset. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %rtn = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst |
| ; Return the pre-operation value reinterpreted as two floats. |
| %cast.rtn = bitcast i64 %rtn to <2 x float> |
| ret <2 x float> %cast.rtn |
| } |
| |
| ; Same as the returning i64 signed-min case, but the address carries an extra |
| ; constant byte offset of -128 on top of %sbase + zext(%voffset). |
| ; In the expected output the constant shows up as "offset:-128" on both the |
| ; initial saddr-form 64-bit load and the 64-bit global_atomic_cmpswap inside |
| ; the retry loop; the result is copied from v[3:4] into v0/v1 after the loop. |
| define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { |
| ; GFX9-LABEL: global_min_saddr_i64_rtn_neg128: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128 |
| ; GFX9-NEXT: v_mov_b32_e32 v6, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB13_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v10, v4 |
| ; GFX9-NEXT: v_mov_b32_e32 v9, v3 |
| ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc |
| ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB13_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: v_mov_b32_e32 v0, v3 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, v4 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_min_saddr_i64_rtn_neg128: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128 |
| ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v10, v4 |
| ; GFX10-NEXT: v_mov_b32_e32 v9, v3 |
| ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] |
| ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: v_mov_b32_e32 v0, v3 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, v4 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_min_saddr_i64_rtn_neg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v10, v4 |
| ; GFX11-NEXT: v_mov_b32_e32 v9, v3 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] |
| ; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB13_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: v_mov_b32_e32 v0, v3 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, v4 |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Address = inreg (scalar) base pointer + zero-extended 32-bit offset - 128 bytes. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %rtn = atomicrmw min ptr addrspace(1) %gep1, i64 %data seq_cst |
| ; Return the pre-operation value reinterpreted as two floats. |
| %cast.rtn = bitcast i64 %rtn to <2 x float> |
| ret <2 x float> %cast.rtn |
| } |
| |
| ; Non-returning signed-min atomicrmw (seq_cst) on the i64 at |
| ; %sbase + zext(%voffset); the function returns void and the atomicrmw result |
| ; is not used. |
| ; The expected output forms the min with v_cmp_le_i64 plus two v_cndmask_b32 |
| ; selects and commits it with a glc 64-bit global_atomic_cmpswap retry loop |
| ; addressed through v[7:8] with "off"; the shader ends in s_endpgm. |
| define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { |
| ; GFX9-LABEL: global_min_saddr_i64_nortn: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc |
| ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] |
| ; GFX9-NEXT: v_mov_b32_e32 v6, v4 |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB14_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: global_min_saddr_i64_nortn: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] |
| ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] |
| ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] |
| ; GFX10-NEXT: v_mov_b32_e32 v6, v4 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: global_min_saddr_i64_nortn: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] |
| ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] |
| ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] |
| ; GFX11-NEXT: v_mov_b32_e32 v6, v4 |
| ; GFX11-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB14_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_endpgm |
| ; Address = inreg (scalar) base pointer + zero-extended 32-bit offset. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| ; The old value is deliberately left unused (no-return variant). |
| %unused = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst |
| ret void |
| } |
| |
| ; Non-returning i64 signed-min atomicrmw (seq_cst) at |
| ; %sbase + zext(%voffset) - 128; the atomicrmw result is not used. |
| ; In the expected output the constant appears as "offset:-128" on both the |
| ; initial saddr-form 64-bit load and the 64-bit global_atomic_cmpswap in the |
| ; retry loop, and the shader ends in s_endpgm. |
| define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { |
| ; GFX9-LABEL: global_min_saddr_i64_nortn_neg128: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc |
| ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] |
| ; GFX9-NEXT: v_mov_b32_e32 v6, v4 |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB15_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: global_min_saddr_i64_nortn_neg128: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128 |
| ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] |
| ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] |
| ; GFX10-NEXT: v_mov_b32_e32 v6, v4 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB15_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: global_min_saddr_i64_nortn_neg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] |
| ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] |
| ; GFX11-NEXT: v_mov_b32_e32 v6, v4 |
| ; GFX11-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB15_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_endpgm |
| ; Address = inreg (scalar) base pointer + zero-extended 32-bit offset - 128 bytes. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| ; The old value is deliberately left unused (no-return variant). |
| %unused = atomicrmw min ptr addrspace(1) %gep1, i64 %data seq_cst |
| ret void |
| } |
| |
| ; -------------------------------------------------------------------------------- |
| ; atomicrmw umax |
| ; -------------------------------------------------------------------------------- |
| |
| ; Unsigned-max atomicrmw on an i32, returning variant, no constant displacement. |
| ; The address is the inreg pointer %sbase plus the zero-extended 32-bit %voffset; |
| ; the value previously held in memory is returned, bitcast to float. |
| ; For every RUN line the expected output is an initial global load (SGPR base + |
| ; VGPR offset) followed by an %atomicrmw.start retry loop around a |
| ; global_atomic_cmpswap that addresses through a 64-bit VGPR pair, with |
| ; v_max_u32 producing the candidate value on each iteration. |
| define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { |
| ; GFX9-LABEL: global_umax_saddr_i32_rtn: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB16_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB16_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_umax_saddr_i32_rtn: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] |
| ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_umax_saddr_i32_rtn: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] |
| ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB16_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Widen the 32-bit offset with zero extension before using it as a byte index. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %rtn = atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst |
| ; Reinterpret the old value's bits as float for the return value. |
| %cast.rtn = bitcast i32 %rtn to float |
| ret float %cast.rtn |
| } |
| |
| ; Unsigned-max atomicrmw on an i32, returning variant, with a constant -128 byte |
| ; displacement applied after the zero-extended %voffset is added to %sbase. |
| ; The value previously held in memory is returned, bitcast to float. |
| ; The expected output for every RUN line carries the displacement as offset:-128 |
| ; on both the initial global load and the global_atomic_cmpswap inside the |
| ; %atomicrmw.start retry loop; v_max_u32 produces the candidate value. |
| define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { |
| ; GFX9-LABEL: global_umax_saddr_i32_rtn_neg128: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB17_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_umax_saddr_i32_rtn_neg128: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 |
| ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_umax_saddr_i32_rtn_neg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB17_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Widen the 32-bit offset with zero extension before using it as a byte index. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| ; Constant -128 byte displacement on top of the variable offset. |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %rtn = atomicrmw umax ptr addrspace(1) %gep1, i32 %data seq_cst |
| ; Reinterpret the old value's bits as float for the return value. |
| %cast.rtn = bitcast i32 %rtn to float |
| ret float %cast.rtn |
| } |
| |
| ; Unsigned-max atomicrmw on an i32, no-return variant, no constant displacement. |
| ; The address is the inreg pointer %sbase plus the zero-extended 32-bit %voffset; |
| ; the atomicrmw result is not consumed and the function returns void. |
| ; The expected output for every RUN line is still an %atomicrmw.start retry loop |
| ; around global_atomic_cmpswap with v_max_u32 producing the candidate value; the |
| ; value returned by the cmpswap is copied into v5 as the next iteration's |
| ; expected value, and the program ends with s_endpgm. |
| define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { |
| ; GFX9-LABEL: global_umax_saddr_i32_nortn: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dword v5, v0, s[2:3] |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB18_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB18_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: global_umax_saddr_i32_nortn: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dword v5, v0, s[2:3] |
| ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB18_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: global_umax_saddr_i32_nortn: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] |
| ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX11-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB18_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_endpgm |
| ; Widen the 32-bit offset with zero extension before using it as a byte index. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| ; The old value is never read; only the memory update matters here. |
| %unused = atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst |
| ret void |
| } |
| |
| ; Unsigned-max atomicrmw on an i32, no-return variant, with a constant -128 byte |
| ; displacement applied after the zero-extended %voffset is added to %sbase. |
| ; The atomicrmw result is not consumed and the function returns void. |
| ; The expected output for every RUN line carries the displacement as offset:-128 |
| ; on both the initial global load and the global_atomic_cmpswap inside the |
| ; %atomicrmw.start retry loop, and the program ends with s_endpgm. |
| define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { |
| ; GFX9-LABEL: global_umax_saddr_i32_nortn_neg128: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dword v5, v0, s[2:3] offset:-128 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB19_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB19_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: global_umax_saddr_i32_nortn_neg128: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dword v5, v0, s[2:3] offset:-128 |
| ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: global_umax_saddr_i32_nortn_neg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 |
| ; GFX11-NEXT: v_mov_b32_e32 v5, v0 |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB19_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_endpgm |
| ; Widen the 32-bit offset with zero extension before using it as a byte index. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| ; Constant -128 byte displacement on top of the variable offset. |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| ; The old value is never read; only the memory update matters here. |
| %unused = atomicrmw umax ptr addrspace(1) %gep1, i32 %data seq_cst |
| ret void |
| } |
| |
| ; Unsigned-max atomicrmw on an i64, returning variant, no constant displacement. |
| ; The address is the inreg pointer %sbase plus the zero-extended 32-bit %voffset; |
| ; the value previously held in memory is returned, bitcast to <2 x float>. |
| ; The expected output for every RUN line is an %atomicrmw.start retry loop |
| ; around the 64-bit global_atomic_cmpswap; the candidate value is selected with |
| ; an unsigned 64-bit compare (v_cmp_gt_u64) followed by two v_cndmask_b32, and |
| ; the final result is moved from v[3:4] into v0/v1 after the loop. |
| define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { |
| ; GFX9-LABEL: global_umax_saddr_i64_rtn: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] |
| ; GFX9-NEXT: v_mov_b32_e32 v6, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB20_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v10, v4 |
| ; GFX9-NEXT: v_mov_b32_e32 v9, v3 |
| ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc |
| ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB20_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: v_mov_b32_e32 v0, v3 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, v4 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_umax_saddr_i64_rtn: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] |
| ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v10, v4 |
| ; GFX10-NEXT: v_mov_b32_e32 v9, v3 |
| ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] |
| ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB20_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: v_mov_b32_e32 v0, v3 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, v4 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_umax_saddr_i64_rtn: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] |
| ; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v10, v4 |
| ; GFX11-NEXT: v_mov_b32_e32 v9, v3 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] |
| ; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB20_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: v_mov_b32_e32 v0, v3 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, v4 |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Widen the 32-bit offset with zero extension before using it as a byte index. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %rtn = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst |
| ; Reinterpret the old 64-bit value's bits as two floats for the return value. |
| %cast.rtn = bitcast i64 %rtn to <2 x float> |
| ret <2 x float> %cast.rtn |
| } |
| |
| ; Unsigned-max atomicrmw on an i64, returning variant, with a constant -128 byte |
| ; displacement applied after the zero-extended %voffset is added to %sbase. |
| ; The value previously held in memory is returned, bitcast to <2 x float>. |
| ; The expected output for every RUN line carries the displacement as offset:-128 |
| ; on both the initial 64-bit global load and the 64-bit global_atomic_cmpswap |
| ; inside the %atomicrmw.start retry loop; the candidate value is selected with |
| ; v_cmp_gt_u64 followed by two v_cndmask_b32. |
| define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { |
| ; GFX9-LABEL: global_umax_saddr_i64_rtn_neg128: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128 |
| ; GFX9-NEXT: v_mov_b32_e32 v6, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB21_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v10, v4 |
| ; GFX9-NEXT: v_mov_b32_e32 v9, v3 |
| ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc |
| ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB21_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: v_mov_b32_e32 v0, v3 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, v4 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_umax_saddr_i64_rtn_neg128: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128 |
| ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v10, v4 |
| ; GFX10-NEXT: v_mov_b32_e32 v9, v3 |
| ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] |
| ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB21_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: v_mov_b32_e32 v0, v3 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, v4 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_umax_saddr_i64_rtn_neg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v10, v4 |
| ; GFX11-NEXT: v_mov_b32_e32 v9, v3 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] |
| ; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB21_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: v_mov_b32_e32 v0, v3 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, v4 |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Widen the 32-bit offset with zero extension before using it as a byte index. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| ; Constant -128 byte displacement on top of the variable offset. |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %rtn = atomicrmw umax ptr addrspace(1) %gep1, i64 %data seq_cst |
| ; Reinterpret the old 64-bit value's bits as two floats for the return value. |
| %cast.rtn = bitcast i64 %rtn to <2 x float> |
| ret <2 x float> %cast.rtn |
| } |
| |
| ; Unsigned-max atomicrmw on an i64, no-return variant, no constant displacement. |
| ; The address is the inreg pointer %sbase plus the zero-extended 32-bit %voffset; |
| ; the atomicrmw result is not consumed and the function returns void. |
| ; The expected output for every RUN line is an %atomicrmw.start retry loop |
| ; around the 64-bit global_atomic_cmpswap; the candidate value is selected with |
| ; v_cmp_gt_u64 followed by two v_cndmask_b32, the value returned by the cmpswap |
| ; is copied into v[5:6] as the next iteration's expected value, and the program |
| ; ends with s_endpgm. |
| define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { |
| ; GFX9-LABEL: global_umax_saddr_i64_nortn: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB22_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc |
| ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] |
| ; GFX9-NEXT: v_mov_b32_e32 v6, v4 |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB22_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: global_umax_saddr_i64_nortn: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] |
| ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] |
| ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] |
| ; GFX10-NEXT: v_mov_b32_e32 v6, v4 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: global_umax_saddr_i64_nortn: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] |
| ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] |
| ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] |
| ; GFX11-NEXT: v_mov_b32_e32 v6, v4 |
| ; GFX11-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB22_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_endpgm |
| ; Widen the 32-bit offset with zero extension before using it as a byte index. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| ; The old value is never read; only the memory update matters here. |
| %unused = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst |
| ret void |
| } |
| |
| ; umax, i64, result unused, constant -128 byte offset.
| ; The address is the inreg base %sbase plus the zero-extended 32-bit %voffset,
| ; minus 128. On every checked target the expected output is a compare-and-swap
| ; retry loop (%atomicrmw.start) seeded by a single load: the load keeps the
| ; SGPR-base form (v0, s[2:3]) while the cmpswap takes a 64-bit VGPR address with
| ; "off", and both carry the -128 in the instruction's offset field.
| define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
| ; GFX9-LABEL: global_umax_saddr_i64_nortn_neg128:
| ; GFX9: ; %bb.0:
| ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3
| ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
| ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0
| ; GFX9-NEXT: .LBB23_1: ; %atomicrmw.start
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX9-NEXT: s_waitcnt vmcnt(0)
| ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
| ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
| ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
| ; GFX9-NEXT: s_waitcnt vmcnt(0)
| ; GFX9-NEXT: buffer_wbinvl1
| ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
| ; GFX9-NEXT: v_mov_b32_e32 v6, v4
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX9-NEXT: v_mov_b32_e32 v5, v3
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
| ; GFX9-NEXT: s_cbranch_execnz .LBB23_1
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX9-NEXT: s_endpgm
| ;
| ; GFX10-LABEL: global_umax_saddr_i64_nortn_neg128:
| ; GFX10: ; %bb.0:
| ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
| ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0
| ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX10-NEXT: s_waitcnt vmcnt(0)
| ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
| ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
| ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
| ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
| ; GFX10-NEXT: s_waitcnt vmcnt(0)
| ; GFX10-NEXT: buffer_gl0_inv
| ; GFX10-NEXT: buffer_gl1_inv
| ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
| ; GFX10-NEXT: v_mov_b32_e32 v6, v4
| ; GFX10-NEXT: v_mov_b32_e32 v5, v3
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
| ; GFX10-NEXT: s_cbranch_execnz .LBB23_1
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX10-NEXT: s_endpgm
| ;
| ; GFX11-LABEL: global_umax_saddr_i64_nortn_neg128:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128
| ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
| ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
| ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
| ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
| ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: buffer_gl0_inv
| ; GFX11-NEXT: buffer_gl1_inv
| ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
| ; GFX11-NEXT: v_mov_b32_e32 v6, v4
| ; GFX11-NEXT: v_mov_b32_e32 v5, v3
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
| ; GFX11-NEXT: s_cbranch_execnz .LBB23_1
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX11-NEXT: s_endpgm
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
| %unused = atomicrmw umax ptr addrspace(1) %gep1, i64 %data seq_cst
| ret void
| }
| |
| ; -------------------------------------------------------------------------------- |
| ; atomicrmw umin |
| ; -------------------------------------------------------------------------------- |
| |
| ; umin, i32, result returned (bitcast to float), no constant offset.
| ; The address is the inreg base %sbase plus the zero-extended 32-bit %voffset.
| ; On every checked target the expected output is a compare-and-swap retry loop
| ; (%atomicrmw.start): one initial load in SGPR-base form (v0, s[2:3]), then
| ; v_min_u32 + global cmpswap on a 64-bit VGPR address until the value read back
| ; equals the value that was compared. The last value read back is left in v0.
| define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
| ; GFX9-LABEL: global_umin_saddr_i32_rtn:
| ; GFX9: ; %bb.0:
| ; GFX9-NEXT: v_mov_b32_e32 v2, v0
| ; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0
| ; GFX9-NEXT: .LBB24_1: ; %atomicrmw.start
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX9-NEXT: s_waitcnt vmcnt(0)
| ; GFX9-NEXT: v_mov_b32_e32 v5, v0
| ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
| ; GFX9-NEXT: s_waitcnt vmcnt(0)
| ; GFX9-NEXT: buffer_wbinvl1
| ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
| ; GFX9-NEXT: s_cbranch_execnz .LBB24_1
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
| ; GFX9-NEXT: ; return to shader part epilog
| ;
| ; GFX10-LABEL: global_umin_saddr_i32_rtn:
| ; GFX10: ; %bb.0:
| ; GFX10-NEXT: v_mov_b32_e32 v2, v0
| ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
| ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0
| ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX10-NEXT: s_waitcnt vmcnt(0)
| ; GFX10-NEXT: v_mov_b32_e32 v5, v0
| ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
| ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
| ; GFX10-NEXT: s_waitcnt vmcnt(0)
| ; GFX10-NEXT: buffer_gl0_inv
| ; GFX10-NEXT: buffer_gl1_inv
| ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
| ; GFX10-NEXT: s_cbranch_execnz .LBB24_1
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
| ; GFX10-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_umin_saddr_i32_rtn:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: v_mov_b32_e32 v2, v0
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
| ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
| ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_mov_b32_e32 v5, v0
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
| ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
| ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: buffer_gl0_inv
| ; GFX11-NEXT: buffer_gl1_inv
| ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
| ; GFX11-NEXT: s_cbranch_execnz .LBB24_1
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
| ; GFX11-NEXT: ; return to shader part epilog
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| %rtn = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst
| %cast.rtn = bitcast i32 %rtn to float
| ret float %cast.rtn
| }
| |
| ; umin, i32, result returned (bitcast to float), constant -128 byte offset.
| ; Same shape as the zero-offset returning i32 case, except that the address is
| ; %sbase + zext(%voffset) - 128. In the expected output the -128 appears in the
| ; offset field of both the initial load and the cmpswap inside the
| ; %atomicrmw.start retry loop, not in the address computation.
| define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
| ; GFX9-LABEL: global_umin_saddr_i32_rtn_neg128:
| ; GFX9: ; %bb.0:
| ; GFX9-NEXT: v_mov_b32_e32 v2, v0
| ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0
| ; GFX9-NEXT: .LBB25_1: ; %atomicrmw.start
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX9-NEXT: s_waitcnt vmcnt(0)
| ; GFX9-NEXT: v_mov_b32_e32 v5, v0
| ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
| ; GFX9-NEXT: s_waitcnt vmcnt(0)
| ; GFX9-NEXT: buffer_wbinvl1
| ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
| ; GFX9-NEXT: s_cbranch_execnz .LBB25_1
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
| ; GFX9-NEXT: ; return to shader part epilog
| ;
| ; GFX10-LABEL: global_umin_saddr_i32_rtn_neg128:
| ; GFX10: ; %bb.0:
| ; GFX10-NEXT: v_mov_b32_e32 v2, v0
| ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
| ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0
| ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX10-NEXT: s_waitcnt vmcnt(0)
| ; GFX10-NEXT: v_mov_b32_e32 v5, v0
| ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
| ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
| ; GFX10-NEXT: s_waitcnt vmcnt(0)
| ; GFX10-NEXT: buffer_gl0_inv
| ; GFX10-NEXT: buffer_gl1_inv
| ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
| ; GFX10-NEXT: s_cbranch_execnz .LBB25_1
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
| ; GFX10-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_umin_saddr_i32_rtn_neg128:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: v_mov_b32_e32 v2, v0
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
| ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
| ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_mov_b32_e32 v5, v0
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
| ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
| ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: buffer_gl0_inv
| ; GFX11-NEXT: buffer_gl1_inv
| ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
| ; GFX11-NEXT: s_cbranch_execnz .LBB25_1
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
| ; GFX11-NEXT: ; return to shader part epilog
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
| %rtn = atomicrmw umin ptr addrspace(1) %gep1, i32 %data seq_cst
| %cast.rtn = bitcast i32 %rtn to float
| ret float %cast.rtn
| }
| |
| ; umin, i32, result unused, no constant offset.
| ; The address is the inreg base %sbase plus the zero-extended 32-bit %voffset.
| ; The expected output is still a returning (glc) compare-and-swap retry loop
| ; (%atomicrmw.start), because the loop needs the value read back to decide
| ; whether to retry; the function then ends with s_endpgm instead of returning
| ; a value.
| define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
| ; GFX9-LABEL: global_umin_saddr_i32_nortn:
| ; GFX9: ; %bb.0:
| ; GFX9-NEXT: global_load_dword v5, v0, s[2:3]
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0
| ; GFX9-NEXT: .LBB26_1: ; %atomicrmw.start
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX9-NEXT: s_waitcnt vmcnt(0)
| ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
| ; GFX9-NEXT: s_waitcnt vmcnt(0)
| ; GFX9-NEXT: buffer_wbinvl1
| ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX9-NEXT: v_mov_b32_e32 v5, v0
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
| ; GFX9-NEXT: s_cbranch_execnz .LBB26_1
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX9-NEXT: s_endpgm
| ;
| ; GFX10-LABEL: global_umin_saddr_i32_nortn:
| ; GFX10: ; %bb.0:
| ; GFX10-NEXT: global_load_dword v5, v0, s[2:3]
| ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0
| ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX10-NEXT: s_waitcnt vmcnt(0)
| ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
| ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
| ; GFX10-NEXT: s_waitcnt vmcnt(0)
| ; GFX10-NEXT: buffer_gl0_inv
| ; GFX10-NEXT: buffer_gl1_inv
| ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
| ; GFX10-NEXT: v_mov_b32_e32 v5, v0
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
| ; GFX10-NEXT: s_cbranch_execnz .LBB26_1
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX10-NEXT: s_endpgm
| ;
| ; GFX11-LABEL: global_umin_saddr_i32_nortn:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3]
| ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
| ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
| ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: buffer_gl0_inv
| ; GFX11-NEXT: buffer_gl1_inv
| ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
| ; GFX11-NEXT: v_mov_b32_e32 v5, v0
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
| ; GFX11-NEXT: s_cbranch_execnz .LBB26_1
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX11-NEXT: s_endpgm
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| %unused = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst
| ret void
| }
| |
| ; umin, i32, result unused, constant -128 byte offset.
| ; The address is %sbase + zext(%voffset) - 128. The expected output is the
| ; compare-and-swap retry loop (%atomicrmw.start) of the zero-offset unused-result
| ; i32 case, with the -128 carried in the offset field of both the initial load
| ; and the cmpswap; the function ends with s_endpgm.
| define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
| ; GFX9-LABEL: global_umin_saddr_i32_nortn_neg128:
| ; GFX9: ; %bb.0:
| ; GFX9-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0
| ; GFX9-NEXT: .LBB27_1: ; %atomicrmw.start
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX9-NEXT: s_waitcnt vmcnt(0)
| ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
| ; GFX9-NEXT: s_waitcnt vmcnt(0)
| ; GFX9-NEXT: buffer_wbinvl1
| ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX9-NEXT: v_mov_b32_e32 v5, v0
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
| ; GFX9-NEXT: s_cbranch_execnz .LBB27_1
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX9-NEXT: s_endpgm
| ;
| ; GFX10-LABEL: global_umin_saddr_i32_nortn_neg128:
| ; GFX10: ; %bb.0:
| ; GFX10-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
| ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0
| ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX10-NEXT: s_waitcnt vmcnt(0)
| ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
| ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
| ; GFX10-NEXT: s_waitcnt vmcnt(0)
| ; GFX10-NEXT: buffer_gl0_inv
| ; GFX10-NEXT: buffer_gl1_inv
| ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
| ; GFX10-NEXT: v_mov_b32_e32 v5, v0
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
| ; GFX10-NEXT: s_cbranch_execnz .LBB27_1
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX10-NEXT: s_endpgm
| ;
| ; GFX11-LABEL: global_umin_saddr_i32_nortn_neg128:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128
| ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
| ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
| ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: buffer_gl0_inv
| ; GFX11-NEXT: buffer_gl1_inv
| ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
| ; GFX11-NEXT: v_mov_b32_e32 v5, v0
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
| ; GFX11-NEXT: s_cbranch_execnz .LBB27_1
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX11-NEXT: s_endpgm
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
| %unused = atomicrmw umin ptr addrspace(1) %gep1, i32 %data seq_cst
| ret void
| }
| |
| ; umin, i64, result returned (bitcast to <2 x float>), no constant offset.
| ; The address is the inreg base %sbase plus the zero-extended 32-bit %voffset.
| ; The expected output is a compare-and-swap retry loop (%atomicrmw.start): the
| ; unsigned 64-bit minimum is formed with v_cmp_le_u64 plus two v_cndmask selects,
| ; then a 64-bit global cmpswap is retried until the value read back equals the
| ; value compared. After the loop the result pair v[3:4] is copied to v0/v1.
| define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
| ; GFX9-LABEL: global_umin_saddr_i64_rtn:
| ; GFX9: ; %bb.0:
| ; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
| ; GFX9-NEXT: v_mov_b32_e32 v6, s3
| ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
| ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0
| ; GFX9-NEXT: .LBB28_1: ; %atomicrmw.start
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX9-NEXT: s_waitcnt vmcnt(0)
| ; GFX9-NEXT: v_mov_b32_e32 v10, v4
| ; GFX9-NEXT: v_mov_b32_e32 v9, v3
| ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
| ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
| ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
| ; GFX9-NEXT: s_waitcnt vmcnt(0)
| ; GFX9-NEXT: buffer_wbinvl1
| ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
| ; GFX9-NEXT: s_cbranch_execnz .LBB28_1
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
| ; GFX9-NEXT: v_mov_b32_e32 v0, v3
| ; GFX9-NEXT: v_mov_b32_e32 v1, v4
| ; GFX9-NEXT: ; return to shader part epilog
| ;
| ; GFX10-LABEL: global_umin_saddr_i64_rtn:
| ; GFX10: ; %bb.0:
| ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
| ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0
| ; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX10-NEXT: s_waitcnt vmcnt(0)
| ; GFX10-NEXT: v_mov_b32_e32 v10, v4
| ; GFX10-NEXT: v_mov_b32_e32 v9, v3
| ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
| ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
| ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
| ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
| ; GFX10-NEXT: s_waitcnt vmcnt(0)
| ; GFX10-NEXT: buffer_gl0_inv
| ; GFX10-NEXT: buffer_gl1_inv
| ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
| ; GFX10-NEXT: s_cbranch_execnz .LBB28_1
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
| ; GFX10-NEXT: v_mov_b32_e32 v0, v3
| ; GFX10-NEXT: v_mov_b32_e32 v1, v4
| ; GFX10-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_umin_saddr_i64_rtn:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3]
| ; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
| ; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_mov_b32_e32 v10, v4
| ; GFX11-NEXT: v_mov_b32_e32 v9, v3
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
| ; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
| ; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
| ; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
| ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: buffer_gl0_inv
| ; GFX11-NEXT: buffer_gl1_inv
| ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
| ; GFX11-NEXT: s_cbranch_execnz .LBB28_1
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
| ; GFX11-NEXT: v_mov_b32_e32 v0, v3
| ; GFX11-NEXT: v_mov_b32_e32 v1, v4
| ; GFX11-NEXT: ; return to shader part epilog
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| %rtn = atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst
| %cast.rtn = bitcast i64 %rtn to <2 x float>
| ret <2 x float> %cast.rtn
| }
| |
| ; umin, i64, result returned (bitcast to <2 x float>), constant -128 byte offset.
| ; The address is %sbase + zext(%voffset) - 128. The expected output is the
| ; compare-and-swap retry loop (%atomicrmw.start) of the zero-offset returning
| ; i64 case, with the -128 carried in the offset field of both the initial load
| ; and the 64-bit cmpswap; the result pair v[3:4] is copied to v0/v1 after the
| ; loop.
| define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
| ; GFX9-LABEL: global_umin_saddr_i64_rtn_neg128:
| ; GFX9: ; %bb.0:
| ; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
| ; GFX9-NEXT: v_mov_b32_e32 v6, s3
| ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
| ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0
| ; GFX9-NEXT: .LBB29_1: ; %atomicrmw.start
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX9-NEXT: s_waitcnt vmcnt(0)
| ; GFX9-NEXT: v_mov_b32_e32 v10, v4
| ; GFX9-NEXT: v_mov_b32_e32 v9, v3
| ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
| ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
| ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
| ; GFX9-NEXT: s_waitcnt vmcnt(0)
| ; GFX9-NEXT: buffer_wbinvl1
| ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
| ; GFX9-NEXT: s_cbranch_execnz .LBB29_1
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
| ; GFX9-NEXT: v_mov_b32_e32 v0, v3
| ; GFX9-NEXT: v_mov_b32_e32 v1, v4
| ; GFX9-NEXT: ; return to shader part epilog
| ;
| ; GFX10-LABEL: global_umin_saddr_i64_rtn_neg128:
| ; GFX10: ; %bb.0:
| ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
| ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0
| ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX10-NEXT: s_waitcnt vmcnt(0)
| ; GFX10-NEXT: v_mov_b32_e32 v10, v4
| ; GFX10-NEXT: v_mov_b32_e32 v9, v3
| ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
| ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
| ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
| ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
| ; GFX10-NEXT: s_waitcnt vmcnt(0)
| ; GFX10-NEXT: buffer_gl0_inv
| ; GFX10-NEXT: buffer_gl1_inv
| ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
| ; GFX10-NEXT: s_cbranch_execnz .LBB29_1
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
| ; GFX10-NEXT: v_mov_b32_e32 v0, v3
| ; GFX10-NEXT: v_mov_b32_e32 v1, v4
| ; GFX10-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_umin_saddr_i64_rtn_neg128:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128
| ; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
| ; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_mov_b32_e32 v10, v4
| ; GFX11-NEXT: v_mov_b32_e32 v9, v3
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
| ; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
| ; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
| ; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
| ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: buffer_gl0_inv
| ; GFX11-NEXT: buffer_gl1_inv
| ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
| ; GFX11-NEXT: s_cbranch_execnz .LBB29_1
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
| ; GFX11-NEXT: v_mov_b32_e32 v0, v3
| ; GFX11-NEXT: v_mov_b32_e32 v1, v4
| ; GFX11-NEXT: ; return to shader part epilog
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
| %rtn = atomicrmw umin ptr addrspace(1) %gep1, i64 %data seq_cst
| %cast.rtn = bitcast i64 %rtn to <2 x float>
| ret <2 x float> %cast.rtn
| }
| |
| ; umin, i64, result unused, no constant offset.
| ; The address is the inreg base %sbase plus the zero-extended 32-bit %voffset.
| ; The expected output is a compare-and-swap retry loop (%atomicrmw.start): the
| ; unsigned 64-bit minimum is formed with v_cmp_le_u64 plus two v_cndmask selects
| ; and fed to a returning (glc) 64-bit cmpswap whose result becomes the compare
| ; value for the next iteration; the function ends with s_endpgm.
| define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
| ; GFX9-LABEL: global_umin_saddr_i64_nortn:
| ; GFX9: ; %bb.0:
| ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3
| ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
| ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0
| ; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX9-NEXT: s_waitcnt vmcnt(0)
| ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
| ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
| ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
| ; GFX9-NEXT: s_waitcnt vmcnt(0)
| ; GFX9-NEXT: buffer_wbinvl1
| ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
| ; GFX9-NEXT: v_mov_b32_e32 v6, v4
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX9-NEXT: v_mov_b32_e32 v5, v3
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
| ; GFX9-NEXT: s_cbranch_execnz .LBB30_1
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX9-NEXT: s_endpgm
| ;
| ; GFX10-LABEL: global_umin_saddr_i64_nortn:
| ; GFX10: ; %bb.0:
| ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
| ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0
| ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX10-NEXT: s_waitcnt vmcnt(0)
| ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
| ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
| ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
| ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
| ; GFX10-NEXT: s_waitcnt vmcnt(0)
| ; GFX10-NEXT: buffer_gl0_inv
| ; GFX10-NEXT: buffer_gl1_inv
| ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
| ; GFX10-NEXT: v_mov_b32_e32 v6, v4
| ; GFX10-NEXT: v_mov_b32_e32 v5, v3
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
| ; GFX10-NEXT: s_cbranch_execnz .LBB30_1
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX10-NEXT: s_endpgm
| ;
| ; GFX11-LABEL: global_umin_saddr_i64_nortn:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3]
| ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
| ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
| ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
| ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
| ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: buffer_gl0_inv
| ; GFX11-NEXT: buffer_gl1_inv
| ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
| ; GFX11-NEXT: v_mov_b32_e32 v6, v4
| ; GFX11-NEXT: v_mov_b32_e32 v5, v3
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
| ; GFX11-NEXT: s_cbranch_execnz .LBB30_1
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
| ; GFX11-NEXT: s_endpgm
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| %unused = atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst
| ret void
| }
| |
| ; Unsigned-min atomicrmw on an i64 whose result is discarded, addressed as |
| ; %sbase + zext(%voffset) - 128. |
| ; The checks below show the operation emitted as an initial 64-bit load followed |
| ; by a compare-and-swap retry loop (%atomicrmw.start), with the -128 carried as |
| ; an immediate offset on both the load and the cmpswap. The load uses the |
| ; SGPR-pair base plus VGPR offset form, while the cmpswap uses a full 64-bit |
| ; VGPR address computed before the loop. |
| define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { |
| ; GFX9-LABEL: global_umin_saddr_i64_nortn_neg128: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc |
| ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] |
| ; GFX9-NEXT: v_mov_b32_e32 v6, v4 |
| ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX9-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB31_1 |
| ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: global_umin_saddr_i64_nortn_neg128: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128 |
| ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] |
| ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] |
| ; GFX10-NEXT: v_mov_b32_e32 v6, v4 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 |
| ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: global_umin_saddr_i64_nortn_neg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_waitcnt_depctr 0xfffe |
| ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] |
| ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] |
| ; GFX11-NEXT: v_mov_b32_e32 v6, v4 |
| ; GFX11-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] |
| ; GFX11-NEXT: s_cbranch_execnz .LBB31_1 |
| ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end |
| ; GFX11-NEXT: s_endpgm |
| ; Zero-extend the 32-bit offset and apply it to the inreg base as a byte offset. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| ; Constant -128 byte displacement; shows up as offset:-128 in the checked instructions. |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| ; The old value produced by the atomicrmw has no users, and the function returns void. |
| %unused = atomicrmw umin ptr addrspace(1) %gep1, i64 %data seq_cst |
| ret void |
| } |
| |
| ; Attribute group #0. |
| ; NOTE(review): the function definitions visible in this excerpt do not |
| ; reference #0 - confirm it is used elsewhere in the file before keeping or removing it. |
| attributes #0 = { argmemonly nounwind willreturn } |