; blob 032072602b5a4e94597658d1335166876ee90d1f (source-viewer residue, not part of the test)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s
; Test: monotonic atomic load of <2 x float> from addrspace(1) at "agent" scope.
; The two elements are extracted, added, and the scalar sum is stored to %out.
; Expected output on every target is one 64-bit global load (marked glc,
; glc dlc, or scope:SCOPE_DEV depending on the target) followed by a wait,
; i.e. the vector is loaded as a whole rather than element by element.
define amdgpu_cs void @atomic_load_f32x2_monotonic_agent(ptr addrspace(1) %p, ptr addrspace(1) %out) {
; GFX9-LABEL: atomic_load_f32x2_monotonic_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f32x2_monotonic_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: global_store_dword v[2:3], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_load_f32x2_monotonic_agent:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11-NEXT: global_store_b32 v[2:3], v0, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f32x2_monotonic_agent:
; GFX12: ; %bb.0:
; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e32 v0, v0, v1
; GFX12-NEXT: global_store_b32 v[2:3], v0, off
; GFX12-NEXT: s_endpgm
%a0 = load atomic <2 x float>, ptr addrspace(1) %p syncscope("agent") monotonic, align 8
%num1 = extractelement <2 x float> %a0, i32 0
%num2 = extractelement <2 x float> %a0, i32 1
%res = fadd float %num1, %num2
store float %res, ptr addrspace(1) %out, align 4
ret void
}
; Test: seq_cst atomic load of <2 x float> from addrspace(1) at "agent" scope.
; The two elements are extracted, added, and the scalar sum is stored to %out.
; Expected output is one 64-bit global load and a wait, followed by cache
; invalidation before the loaded value is used: buffer_wbinvl1_vol on gfx9,
; buffer_gl1_inv + buffer_gl0_inv on gfx10 and gfx11, and
; global_inv scope:SCOPE_DEV on gfx12.
define amdgpu_cs void @atomic_load_f32x2_seq_cst_agent(ptr addrspace(1) %p, ptr addrspace(1) %out) {
; GFX9-LABEL: atomic_load_f32x2_seq_cst_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f32x2_seq_cst_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: global_store_dword v[2:3], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_load_f32x2_seq_cst_agent:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11-NEXT: global_store_b32 v[2:3], v0, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f32x2_seq_cst_agent:
; GFX12: ; %bb.0:
; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_add_f32_e32 v0, v0, v1
; GFX12-NEXT: global_store_b32 v[2:3], v0, off
; GFX12-NEXT: s_endpgm
%a0 = load atomic <2 x float>, ptr addrspace(1) %p syncscope("agent") seq_cst, align 8
%num1 = extractelement <2 x float> %a0, i32 0
%num2 = extractelement <2 x float> %a0, i32 1
%res = fadd float %num1, %num2
store float %res, ptr addrspace(1) %out, align 4
ret void
}
; Test: monotonic atomic load of <2 x float> from addrspace(1) at "wavefront"
; scope. The two elements are extracted, added, and the sum is stored to %out.
; Expected output is one 64-bit global load with no glc, dlc, or scope
; modifier on any target, followed by a wait.
define amdgpu_cs void @atomic_load_f32x2_monotonic_wavefront(ptr addrspace(1) %p, ptr addrspace(1) %out) {
; GFX9-LABEL: atomic_load_f32x2_monotonic_wavefront:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f32x2_monotonic_wavefront:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: global_store_dword v[2:3], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_load_f32x2_monotonic_wavefront:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11-NEXT: global_store_b32 v[2:3], v0, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f32x2_monotonic_wavefront:
; GFX12: ; %bb.0:
; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e32 v0, v0, v1
; GFX12-NEXT: global_store_b32 v[2:3], v0, off
; GFX12-NEXT: s_endpgm
%a0 = load atomic <2 x float>, ptr addrspace(1) %p syncscope("wavefront") monotonic, align 8
%num1 = extractelement <2 x float> %a0, i32 0
%num2 = extractelement <2 x float> %a0, i32 1
%res = fadd float %num1, %num2
store float %res, ptr addrspace(1) %out, align 4
ret void
}
; Test: monotonic atomic load of <2 x half> from addrspace(1) at "agent" scope.
; The two elements are extracted, added, and the half result is stored to %out.
; Expected output is one 32-bit global load on every target. The add differs:
; gfx9 and gfx10 use v_add_f16_sdwa selecting WORD_1 for the second operand;
; on gfx11 and gfx12 the SelectionDAG runs read the high half directly (v0.h)
; while the GlobalISel runs first shift the loaded dword right by 16.
define amdgpu_cs void @atomic_load_f16x2_monotonic_agent(ptr addrspace(1) %p, ptr addrspace(1) %out) {
; GFX9-LABEL: atomic_load_f16x2_monotonic_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f16x2_monotonic_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_f16x2_monotonic_agent:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off glc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_f16x2_monotonic_agent:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: global_load_b32 v0, v[0:1], off glc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_f16x2_monotonic_agent:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_DEV
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_f16x2_monotonic_agent:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_DEV
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-GISEL-NEXT: s_endpgm
%a0 = load atomic <2 x half>, ptr addrspace(1) %p syncscope("agent") monotonic, align 4
%num1 = extractelement <2 x half> %a0, i32 0
%num2 = extractelement <2 x half> %a0, i32 1
%res = fadd half %num1, %num2
store half %res, ptr addrspace(1) %out, align 4
ret void
}
; Test: seq_cst atomic load of <2 x half> from addrspace(1) at "agent" scope.
; The two elements are extracted, added, and the half result is stored to %out.
; Expected output is one 32-bit global load and a wait, followed by cache
; invalidation (buffer_wbinvl1_vol on gfx9, buffer_gl1_inv + buffer_gl0_inv on
; gfx10 and gfx11, global_inv scope:SCOPE_DEV on gfx12) before the add.
define amdgpu_cs void @atomic_load_f16x2_seq_cst_agent(ptr addrspace(1) %p, ptr addrspace(1) %out) {
; GFX9-LABEL: atomic_load_f16x2_seq_cst_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f16x2_seq_cst_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_f16x2_seq_cst_agent:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off glc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: buffer_gl1_inv
; GFX11-SDAG-NEXT: buffer_gl0_inv
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_f16x2_seq_cst_agent:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: global_load_b32 v0, v[0:1], off glc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: buffer_gl1_inv
; GFX11-GISEL-NEXT: buffer_gl0_inv
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_f16x2_seq_cst_agent:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_DEV
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_f16x2_seq_cst_agent:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_DEV
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-GISEL-NEXT: s_endpgm
%a0 = load atomic <2 x half>, ptr addrspace(1) %p syncscope("agent") seq_cst, align 4
%num1 = extractelement <2 x half> %a0, i32 0
%num2 = extractelement <2 x half> %a0, i32 1
%res = fadd half %num1, %num2
store half %res, ptr addrspace(1) %out, align 4
ret void
}
; Test: monotonic atomic load of <2 x half> from addrspace(1) at "wavefront"
; scope. The two elements are extracted, added, and the half result is stored
; to %out. Expected output is one 32-bit global load with no glc, dlc, or
; scope modifier on any target, followed by a wait and the add.
define amdgpu_cs void @atomic_load_f16x2_monotonic_wavefront(ptr addrspace(1) %p, ptr addrspace(1) %out) {
; GFX9-LABEL: atomic_load_f16x2_monotonic_wavefront:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f16x2_monotonic_wavefront:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_f16x2_monotonic_wavefront:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_f16x2_monotonic_wavefront:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_f16x2_monotonic_wavefront:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_f16x2_monotonic_wavefront:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-GISEL-NEXT: s_endpgm
%a0 = load atomic <2 x half>, ptr addrspace(1) %p syncscope("wavefront") monotonic, align 4
%num1 = extractelement <2 x half> %a0, i32 0
%num2 = extractelement <2 x half> %a0, i32 1
%res = fadd half %num1, %num2
store half %res, ptr addrspace(1) %out, align 4
ret void
}
; Test: monotonic atomic load of <2 x i16> from addrspace(1) at "agent" scope.
; The two elements are extracted, added, and the i16 sum is stored to %out.
; Expected output is one 32-bit global load on every target. The add differs:
; gfx9 uses v_add_u16_sdwa selecting WORD_1, gfx10 shifts right by 16 and uses
; v_add_nc_u16; on gfx11 and gfx12 the SelectionDAG runs read v0.h directly
; while the GlobalISel runs shift right by 16 first.
define amdgpu_cs void @atomic_load_i16x2_monotonic_agent(ptr addrspace(1) %p, ptr addrspace(1) %out) {
; GFX9-LABEL: atomic_load_i16x2_monotonic_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_i16x2_monotonic_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_agent:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off glc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_agent:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: global_load_b32 v0, v[0:1], off glc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_agent:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_DEV
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_agent:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_DEV
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-GISEL-NEXT: s_endpgm
%a = load atomic <2 x i16>, ptr addrspace(1) %p syncscope("agent") monotonic, align 4
%e0 = extractelement <2 x i16> %a, i32 0
%e1 = extractelement <2 x i16> %a, i32 1
%sum = add i16 %e0, %e1
store i16 %sum, ptr addrspace(1) %out, align 4
ret void
}
; Test: seq_cst atomic load of <2 x i16> from addrspace(1) at "agent" scope.
; The two elements are extracted, added, and the i16 sum is stored to %out.
; Expected output is one 32-bit global load and a wait, followed by cache
; invalidation (buffer_wbinvl1_vol on gfx9, buffer_gl1_inv + buffer_gl0_inv on
; gfx10 and gfx11, global_inv scope:SCOPE_DEV on gfx12) before the add.
define amdgpu_cs void @atomic_load_i16x2_seq_cst_agent(ptr addrspace(1) %p, ptr addrspace(1) %out) {
; GFX9-LABEL: atomic_load_i16x2_seq_cst_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_i16x2_seq_cst_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_i16x2_seq_cst_agent:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off glc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: buffer_gl1_inv
; GFX11-SDAG-NEXT: buffer_gl0_inv
; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_i16x2_seq_cst_agent:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: global_load_b32 v0, v[0:1], off glc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: buffer_gl1_inv
; GFX11-GISEL-NEXT: buffer_gl0_inv
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_i16x2_seq_cst_agent:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_DEV
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_i16x2_seq_cst_agent:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_DEV
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-GISEL-NEXT: s_endpgm
%a = load atomic <2 x i16>, ptr addrspace(1) %p syncscope("agent") seq_cst, align 4
%e0 = extractelement <2 x i16> %a, i32 0
%e1 = extractelement <2 x i16> %a, i32 1
%sum = add i16 %e0, %e1
store i16 %sum, ptr addrspace(1) %out, align 4
ret void
}
; Test: monotonic atomic load of <2 x i16> from addrspace(1) at "wavefront"
; scope. The two elements are extracted, added, and the i16 sum is stored to
; %out. Expected output is one 32-bit global load with no glc, dlc, or scope
; modifier on any target, followed by a wait and the add.
define amdgpu_cs void @atomic_load_i16x2_monotonic_wavefront(ptr addrspace(1) %p, ptr addrspace(1) %out) {
; GFX9-LABEL: atomic_load_i16x2_monotonic_wavefront:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_i16x2_monotonic_wavefront:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_wavefront:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_wavefront:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_wavefront:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_wavefront:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-GISEL-NEXT: s_endpgm
%a = load atomic <2 x i16>, ptr addrspace(1) %p syncscope("wavefront") monotonic, align 4
%e0 = extractelement <2 x i16> %a, i32 0
%e1 = extractelement <2 x i16> %a, i32 1
%sum = add i16 %e0, %e1
store i16 %sum, ptr addrspace(1) %out, align 4
ret void
}
; Test: monotonic atomic load of <4 x half> from addrspace(1) at "agent" scope.
; All four elements are extracted and combined as (e0 + e1) + (e2 * e3); the
; half result is stored to %out. Expected output is one 64-bit global load on
; every target (marked glc, glc dlc, or scope:SCOPE_DEV depending on the
; target), followed by a wait and the arithmetic on the two loaded dwords.
define amdgpu_cs void @atomic_load_f16x4_monotonic_agent(ptr addrspace(1) %p, ptr addrspace(1) %out) {
; GFX9-LABEL: atomic_load_f16x4_monotonic_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_add_f16_e32 v0, v0, v1
; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f16x4_monotonic_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_add_f16_e32 v0, v0, v1
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_f16x4_monotonic_agent:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off glc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: v_mul_f16_e32 v0.h, v1.l, v1.h
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_f16x4_monotonic_agent:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off glc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v4.l
; GFX11-GISEL-NEXT: v_mul_f16_e32 v0.h, v1.l, v5.l
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_f16x4_monotonic_agent:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: v_mul_f16_e32 v0.h, v1.l, v1.h
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_f16x4_monotonic_agent:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v4.l
; GFX12-GISEL-NEXT: v_mul_f16_e32 v0.h, v1.l, v5.l
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-GISEL-NEXT: s_endpgm
%a0 = load atomic <4 x half>, ptr addrspace(1) %p syncscope("agent") monotonic, align 8
%num1 = extractelement <4 x half> %a0, i32 0
%num2 = extractelement <4 x half> %a0, i32 1
%num3 = extractelement <4 x half> %a0, i32 2
%num4 = extractelement <4 x half> %a0, i32 3
%add = fadd half %num1, %num2
%mul = fmul half %num3, %num4
%res = fadd half %add, %mul
store half %res, ptr addrspace(1) %out, align 4
ret void
}
; Test: seq_cst atomic load of <4 x half> from addrspace(1) at "agent" scope.
; All four elements are extracted and combined as (e0 + e1) + (e2 * e3); the
; half result is stored to %out. Expected output is one 64-bit global load and
; a wait, followed by cache invalidation (buffer_wbinvl1_vol on gfx9,
; buffer_gl1_inv + buffer_gl0_inv on gfx10 and gfx11,
; global_inv scope:SCOPE_DEV on gfx12) before the arithmetic.
define amdgpu_cs void @atomic_load_f16x4_seq_cst_agent(ptr addrspace(1) %p, ptr addrspace(1) %out) {
; GFX9-LABEL: atomic_load_f16x4_seq_cst_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_add_f16_e32 v0, v0, v1
; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f16x4_seq_cst_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_add_f16_e32 v0, v0, v1
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_f16x4_seq_cst_agent:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off glc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: buffer_gl1_inv
; GFX11-SDAG-NEXT: buffer_gl0_inv
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: v_mul_f16_e32 v0.h, v1.l, v1.h
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_f16x4_seq_cst_agent:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off glc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: buffer_gl1_inv
; GFX11-GISEL-NEXT: buffer_gl0_inv
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v4.l
; GFX11-GISEL-NEXT: v_mul_f16_e32 v0.h, v1.l, v5.l
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_f16x4_seq_cst_agent:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: v_mul_f16_e32 v0.h, v1.l, v1.h
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_f16x4_seq_cst_agent:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v4.l
; GFX12-GISEL-NEXT: v_mul_f16_e32 v0.h, v1.l, v5.l
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-GISEL-NEXT: s_endpgm
%a0 = load atomic <4 x half>, ptr addrspace(1) %p syncscope("agent") seq_cst, align 8
%num1 = extractelement <4 x half> %a0, i32 0
%num2 = extractelement <4 x half> %a0, i32 1
%num3 = extractelement <4 x half> %a0, i32 2
%num4 = extractelement <4 x half> %a0, i32 3
%add = fadd half %num1, %num2
%mul = fmul half %num3, %num4
%res = fadd half %add, %mul
store half %res, ptr addrspace(1) %out, align 4
ret void
}
; Test: monotonic atomic load of <4 x half> from addrspace(1) at "wavefront"
; scope. All four elements are extracted and combined as
; (e0 + e1) + (e2 * e3); the half result is stored to %out. Expected output is
; one 64-bit global load with no glc, dlc, or scope modifier on any target,
; followed by a wait and the arithmetic.
define amdgpu_cs void @atomic_load_f16x4_monotonic_wavefront(ptr addrspace(1) %p, ptr addrspace(1) %out) {
; GFX9-LABEL: atomic_load_f16x4_monotonic_wavefront:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_add_f16_e32 v0, v0, v1
; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f16x4_monotonic_wavefront:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_add_f16_e32 v0, v0, v1
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_f16x4_monotonic_wavefront:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: v_mul_f16_e32 v0.h, v1.l, v1.h
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_f16x4_monotonic_wavefront:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v4.l
; GFX11-GISEL-NEXT: v_mul_f16_e32 v0.h, v1.l, v5.l
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_f16x4_monotonic_wavefront:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: v_mul_f16_e32 v0.h, v1.l, v1.h
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_f16x4_monotonic_wavefront:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v4.l
; GFX12-GISEL-NEXT: v_mul_f16_e32 v0.h, v1.l, v5.l
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-GISEL-NEXT: s_endpgm
%a0 = load atomic <4 x half>, ptr addrspace(1) %p syncscope("wavefront") monotonic, align 8
%num1 = extractelement <4 x half> %a0, i32 0
%num2 = extractelement <4 x half> %a0, i32 1
%num3 = extractelement <4 x half> %a0, i32 2
%num4 = extractelement <4 x half> %a0, i32 3
%add = fadd half %num1, %num2
%mul = fmul half %num3, %num4
%res = fadd half %add, %mul
store half %res, ptr addrspace(1) %out, align 4
ret void
}
; Codegen test: atomic load of <4 x i16> through an addrspace(1) pointer with
; syncscope("agent") and monotonic ordering.
; The autogenerated checks below expect one 64-bit global load per target,
; carrying `glc` (gfx900, gfx1100), `glc dlc` (gfx1010) or `scope:SCOPE_DEV`
; (gfx1200), followed by a wait instruction before the lanes are consumed.
;   %p   - address the vector is atomically loaded from.
;   %out - address the i16 result is stored to.
define amdgpu_cs void @atomic_load_i16x4_monotonic_agent(ptr addrspace(1) %p, ptr addrspace(1) %out) {
; GFX9-LABEL: atomic_load_i16x4_monotonic_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_mad_legacy_u16 v0, v1, v4, v0
; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_i16x4_monotonic_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX10-NEXT: v_add_nc_u16 v0, v0, v4
; GFX10-NEXT: v_mad_u16 v0, v1, v5, v0
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_i16x4_monotonic_agent:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off glc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_mad_u16 v0.l, v1.l, v1.h, v0.l
; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_i16x4_monotonic_agent:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off glc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l
; GFX11-GISEL-NEXT: v_mad_u16 v0.l, v1.l, v5.l, v0.l
; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_i16x4_monotonic_agent:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_mad_u16 v0.l, v1.l, v1.h, v0.l
; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_i16x4_monotonic_agent:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l
; GFX12-GISEL-NEXT: v_mad_u16 v0.l, v1.l, v5.l, v0.l
; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-GISEL-NEXT: s_endpgm
; Every lane of the loaded vector feeds the stored value:
; result = (lane0 + lane1) + (lane2 * lane3).
%a0 = load atomic <4 x i16>, ptr addrspace(1) %p syncscope("agent") monotonic, align 8
%num1 = extractelement <4 x i16> %a0, i32 0
%num2 = extractelement <4 x i16> %a0, i32 1
%num3 = extractelement <4 x i16> %a0, i32 2
%num4 = extractelement <4 x i16> %a0, i32 3
%add = add i16 %num1, %num2
%mul = mul i16 %num3, %num4
%res = add i16 %add, %mul
store i16 %res, ptr addrspace(1) %out, align 4
ret void
}
; Codegen test: atomic load of <4 x i16> through an addrspace(1) pointer with
; syncscope("agent") and seq_cst ordering.
; The autogenerated checks below expect one 64-bit global load per target
; (`glc` on gfx900/gfx1100, `glc dlc` on gfx1010, `scope:SCOPE_DEV` on gfx1200)
; and, between the wait and the arithmetic, extra instructions:
; `buffer_wbinvl1_vol` (gfx900), `buffer_gl1_inv` + `buffer_gl0_inv`
; (gfx1010, gfx1100) or `global_inv scope:SCOPE_DEV` (gfx1200).
;   %p   - address the vector is atomically loaded from.
;   %out - address the i16 result is stored to.
define amdgpu_cs void @atomic_load_i16x4_seq_cst_agent(ptr addrspace(1) %p, ptr addrspace(1) %out) {
; GFX9-LABEL: atomic_load_i16x4_seq_cst_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_mad_legacy_u16 v0, v1, v4, v0
; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_i16x4_seq_cst_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX10-NEXT: v_add_nc_u16 v0, v0, v4
; GFX10-NEXT: v_mad_u16 v0, v1, v5, v0
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_i16x4_seq_cst_agent:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off glc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: buffer_gl1_inv
; GFX11-SDAG-NEXT: buffer_gl0_inv
; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_mad_u16 v0.l, v1.l, v1.h, v0.l
; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_i16x4_seq_cst_agent:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off glc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: buffer_gl1_inv
; GFX11-GISEL-NEXT: buffer_gl0_inv
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l
; GFX11-GISEL-NEXT: v_mad_u16 v0.l, v1.l, v5.l, v0.l
; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_i16x4_seq_cst_agent:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_mad_u16 v0.l, v1.l, v1.h, v0.l
; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_i16x4_seq_cst_agent:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l
; GFX12-GISEL-NEXT: v_mad_u16 v0.l, v1.l, v5.l, v0.l
; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-GISEL-NEXT: s_endpgm
; Every lane of the loaded vector feeds the stored value:
; result = (lane0 + lane1) + (lane2 * lane3).
%a0 = load atomic <4 x i16>, ptr addrspace(1) %p syncscope("agent") seq_cst, align 8
%num1 = extractelement <4 x i16> %a0, i32 0
%num2 = extractelement <4 x i16> %a0, i32 1
%num3 = extractelement <4 x i16> %a0, i32 2
%num4 = extractelement <4 x i16> %a0, i32 3
%add = add i16 %num1, %num2
%mul = mul i16 %num3, %num4
%res = add i16 %add, %mul
store i16 %res, ptr addrspace(1) %out, align 4
ret void
}
; Codegen test: atomic load of <4 x i16> through an addrspace(1) pointer with
; syncscope("wavefront") and monotonic ordering.
; The autogenerated checks below expect one 64-bit global load per target with
; no `glc`, `dlc` or `scope:` modifier on the load instruction.
;   %p   - address the vector is atomically loaded from.
;   %out - address the i16 result is stored to.
define amdgpu_cs void @atomic_load_i16x4_monotonic_wavefront(ptr addrspace(1) %p, ptr addrspace(1) %out) {
; GFX9-LABEL: atomic_load_i16x4_monotonic_wavefront:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_mad_legacy_u16 v0, v1, v4, v0
; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_i16x4_monotonic_wavefront:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX10-NEXT: v_add_nc_u16 v0, v0, v4
; GFX10-NEXT: v_mad_u16 v0, v1, v5, v0
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_i16x4_monotonic_wavefront:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_mad_u16 v0.l, v1.l, v1.h, v0.l
; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_i16x4_monotonic_wavefront:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l
; GFX11-GISEL-NEXT: v_mad_u16 v0.l, v1.l, v5.l, v0.l
; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_i16x4_monotonic_wavefront:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_mad_u16 v0.l, v1.l, v1.h, v0.l
; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_i16x4_monotonic_wavefront:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l
; GFX12-GISEL-NEXT: v_mad_u16 v0.l, v1.l, v5.l, v0.l
; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-GISEL-NEXT: s_endpgm
; Every lane of the loaded vector feeds the stored value:
; result = (lane0 + lane1) + (lane2 * lane3).
%a0 = load atomic <4 x i16>, ptr addrspace(1) %p syncscope("wavefront") monotonic, align 8
%num1 = extractelement <4 x i16> %a0, i32 0
%num2 = extractelement <4 x i16> %a0, i32 1
%num3 = extractelement <4 x i16> %a0, i32 2
%num4 = extractelement <4 x i16> %a0, i32 3
%add = add i16 %num1, %num2
%mul = mul i16 %num3, %num4
%res = add i16 %add, %mul
store i16 %res, ptr addrspace(1) %out, align 4
ret void
}
; Codegen test: agent-scope monotonic atomic load of <2 x float> from %p plus a
; constant byte offset of 1.
; The autogenerated checks below expect the constant to be folded into the
; load instruction as `offset:1` on every target, with no separate address
; arithmetic.
;   %p   - base address; the load reads from %p + 1 byte.
;   %out - address the float result (lane0 + lane1) is stored to.
define amdgpu_cs void @atomic_load_f32x2_monotonic_agent_offset_1(ptr addrspace(1) %p, ptr addrspace(1) %out) {
; GFX9-LABEL: atomic_load_f32x2_monotonic_agent_offset_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:1 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f32x2_monotonic_agent_offset_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:1 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: global_store_dword v[2:3], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_load_f32x2_monotonic_agent_offset_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:1 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11-NEXT: global_store_b32 v[2:3], v0, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f32x2_monotonic_agent_offset_1:
; GFX12: ; %bb.0:
; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off offset:1 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e32 v0, v0, v1
; GFX12-NEXT: global_store_b32 v[2:3], v0, off
; GFX12-NEXT: s_endpgm
; Review note - the load keeps `align 8` although its address is %p + 1;
; presumably deliberate so that only the offset folding is exercised (confirm).
%gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 1
%a0 = load atomic <2 x float>, ptr addrspace(1) %gep syncscope("agent") monotonic, align 8
%num1 = extractelement <2 x float> %a0, i32 0
%num2 = extractelement <2 x float> %a0, i32 1
%res = fadd float %num1, %num2
store float %res, ptr addrspace(1) %out, align 4
ret void
}
; Codegen test: agent-scope monotonic atomic load of <2 x float> from %p plus a
; constant byte offset of 4095.
; The autogenerated checks below expect:
;   - gfx900, gfx1100, gfx1200 - the constant folded into the load as
;     `offset:4095`;
;   - gfx1010 SelectionDAG - 0x800 added to the 64-bit address first, then the
;     load uses `offset:2047` (0x800 + 2047 = 4095);
;   - gfx1010 GlobalISel - the full 0xfff added to the address and the load
;     uses no immediate offset.
; Presumably 4095 is the largest immediate offset some targets accept and it
; exceeds the gfx1010 range - TODO confirm against the ISA documentation.
;   %p   - base address; the load reads from %p + 4095 bytes.
;   %out - address the float result (lane0 + lane1) is stored to.
define amdgpu_cs void @atomic_load_f32x2_monotonic_agent_offset_max(ptr addrspace(1) %p, ptr addrspace(1) %out) {
; GFX9-LABEL: atomic_load_f32x2_monotonic_agent_offset_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: atomic_load_f32x2_monotonic_agent_offset_max:
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-SDAG-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:2047 glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-SDAG-NEXT: global_store_dword v[2:3], v0, off
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: atomic_load_f32x2_monotonic_agent_offset_max:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc dlc
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: global_store_dword v[2:3], v0, off
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_load_f32x2_monotonic_agent_offset_max:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4095 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11-NEXT: global_store_b32 v[2:3], v0, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f32x2_monotonic_agent_offset_max:
; GFX12: ; %bb.0:
; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4095 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e32 v0, v0, v1
; GFX12-NEXT: global_store_b32 v[2:3], v0, off
; GFX12-NEXT: s_endpgm
; Review note - the load keeps `align 8` although its address is %p + 4095;
; presumably deliberate so that only the offset handling is exercised (confirm).
%gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4095
%a0 = load atomic <2 x float>, ptr addrspace(1) %gep syncscope("agent") monotonic, align 8
%num1 = extractelement <2 x float> %a0, i32 0
%num2 = extractelement <2 x float> %a0, i32 1
%res = fadd float %num1, %num2
store float %res, ptr addrspace(1) %out, align 4
ret void
}
; Codegen test: agent-scope monotonic atomic load of <2 x i16> from %p plus a
; constant byte offset of 1.
; The autogenerated checks below expect a single 32-bit global load with the
; constant folded in as `offset:1` on every target, followed by a 16-bit add
; of the low and high halves of the loaded register.
;   %p   - base address; the load reads from %p + 1 byte.
;   %out - address the i16 result (lane0 + lane1) is stored to.
define amdgpu_cs void @atomic_load_i16x2_monotonic_agent_offset_1(ptr addrspace(1) %p, ptr addrspace(1) %out) {
; GFX9-LABEL: atomic_load_i16x2_monotonic_agent_offset_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:1 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_i16x2_monotonic_agent_offset_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:1 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_1:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off offset:1 glc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_1:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: global_load_b32 v0, v[0:1], off offset:1 glc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_1:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off offset:1 scope:SCOPE_DEV
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_1:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off offset:1 scope:SCOPE_DEV
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-GISEL-NEXT: s_endpgm
; Review note - the load keeps `align 8` although its address is %p + 1;
; presumably deliberate so that only the offset folding is exercised (confirm).
%gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 1
%a = load atomic <2 x i16>, ptr addrspace(1) %gep syncscope("agent") monotonic, align 8
%e0 = extractelement <2 x i16> %a, i32 0
%e1 = extractelement <2 x i16> %a, i32 1
%sum = add i16 %e0, %e1
store i16 %sum, ptr addrspace(1) %out, align 4
ret void
}
; Codegen test: agent-scope monotonic atomic load of <2 x i16> from %p plus a
; constant byte offset of 4095.
; The autogenerated checks below expect a single 32-bit global load, with:
;   - gfx900, gfx1100, gfx1200 - the constant folded into the load as
;     `offset:4095`;
;   - gfx1010 SelectionDAG - 0x800 added to the 64-bit address first, then the
;     load uses `offset:2047` (0x800 + 2047 = 4095);
;   - gfx1010 GlobalISel - the full 0xfff added to the address and the load
;     uses no immediate offset.
; Presumably 4095 is the largest immediate offset some targets accept and it
; exceeds the gfx1010 range - TODO confirm against the ISA documentation.
;   %p   - base address; the load reads from %p + 4095 bytes.
;   %out - address the i16 result (lane0 + lane1) is stored to.
define amdgpu_cs void @atomic_load_i16x2_monotonic_agent_offset_max(ptr addrspace(1) %p, ptr addrspace(1) %out) {
; GFX9-LABEL: atomic_load_i16x2_monotonic_agent_offset_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_max:
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-SDAG-NEXT: global_load_dword v0, v[0:1], off offset:2047 glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-SDAG-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-SDAG-NEXT: global_store_short v[2:3], v0, off
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_max:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-GISEL-NEXT: global_load_dword v0, v[0:1], off glc dlc
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-GISEL-NEXT: global_store_short v[2:3], v0, off
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_max:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off offset:4095 glc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_max:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: global_load_b32 v0, v[0:1], off offset:4095 glc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_max:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off offset:4095 scope:SCOPE_DEV
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_max:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off offset:4095 scope:SCOPE_DEV
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX12-GISEL-NEXT: global_store_b16 v[2:3], v0, off
; GFX12-GISEL-NEXT: s_endpgm
; Review note - the load keeps `align 8` although its address is %p + 4095;
; presumably deliberate so that only the offset handling is exercised (confirm).
%gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4095
%a = load atomic <2 x i16>, ptr addrspace(1) %gep syncscope("agent") monotonic, align 8
%e0 = extractelement <2 x i16> %a, i32 0
%e1 = extractelement <2 x i16> %a, i32 1
%sum = add i16 %e0, %e1
store i16 %sum, ptr addrspace(1) %out, align 4
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
; GFX9-GISEL: {{.*}}
; GFX9-SDAG: {{.*}}