; blob: 45432958de819aeaaa1844bd7d9217593d03cec3 [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s
; Atomically loads a <2 x float> from the addrspace(3) pointer %p with
; syncscope("agent") and monotonic ordering (8-byte aligned), adds the two
; elements and stores the float result to the addrspace(3) pointer %out.
; The checks below expect a single 64-bit DS read, a wait on that read, one
; v_add_f32 and a 32-bit DS write on every target; no cache-invalidate
; instruction is expected.
define amdgpu_cs void @atomic_load_f32x2_monotonic_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) {
; GFX9-LABEL: atomic_load_f32x2_monotonic_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: ds_read_b64 v[2:3], v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v2, v3
; GFX9-NEXT: ds_write_b32 v1, v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f32x2_monotonic_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: ds_read_b64 v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_f32_e32 v0, v2, v3
; GFX10-NEXT: ds_write_b32 v1, v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_load_f32x2_monotonic_agent:
; GFX11: ; %bb.0:
; GFX11-NEXT: ds_load_b64 v[2:3], v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v0, v2, v3
; GFX11-NEXT: ds_store_b32 v1, v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f32x2_monotonic_agent:
; GFX12: ; %bb.0:
; GFX12-NEXT: ds_load_b64 v[2:3], v0
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_add_f32_e32 v0, v2, v3
; GFX12-NEXT: ds_store_b32 v1, v0
; GFX12-NEXT: s_endpgm
%a0 = load atomic <2 x float>, ptr addrspace(3) %p syncscope("agent") monotonic, align 8
%num1 = extractelement <2 x float> %a0, i32 0
%num2 = extractelement <2 x float> %a0, i32 1
%res = fadd float %num1, %num2
store float %res, ptr addrspace(3) %out, align 4
ret void
}
; Atomically loads a <2 x float> from the addrspace(3) pointer %p with
; syncscope("agent") and seq_cst ordering (8-byte aligned), adds the two
; elements and stores the float result to the addrspace(3) pointer %out.
; Besides the 64-bit DS read and the wait, the checks below expect a cache
; invalidate right after the wait on the newer targets: buffer_gl0_inv for
; gfx10 and gfx11, global_inv scope:SCOPE_SE for gfx12. The gfx9 checks
; expect no invalidate.
define amdgpu_cs void @atomic_load_f32x2_seq_cst_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) {
; GFX9-LABEL: atomic_load_f32x2_seq_cst_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: ds_read_b64 v[2:3], v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v2, v3
; GFX9-NEXT: ds_write_b32 v1, v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f32x2_seq_cst_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: ds_read_b64 v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_add_f32_e32 v0, v2, v3
; GFX10-NEXT: ds_write_b32 v1, v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_load_f32x2_seq_cst_agent:
; GFX11: ; %bb.0:
; GFX11-NEXT: ds_load_b64 v[2:3], v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_add_f32_e32 v0, v2, v3
; GFX11-NEXT: ds_store_b32 v1, v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f32x2_seq_cst_agent:
; GFX12: ; %bb.0:
; GFX12-NEXT: ds_load_b64 v[2:3], v0
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_add_f32_e32 v0, v2, v3
; GFX12-NEXT: ds_store_b32 v1, v0
; GFX12-NEXT: s_endpgm
%a0 = load atomic <2 x float>, ptr addrspace(3) %p syncscope("agent") seq_cst, align 8
%num1 = extractelement <2 x float> %a0, i32 0
%num2 = extractelement <2 x float> %a0, i32 1
%res = fadd float %num1, %num2
store float %res, ptr addrspace(3) %out, align 4
ret void
}
; Atomically loads a <2 x float> from the addrspace(3) pointer %p with
; syncscope("wavefront") and monotonic ordering (8-byte aligned), adds the
; two elements and stores the float result to the addrspace(3) pointer %out.
; The checks below expect a single 64-bit DS read, a wait on that read, one
; v_add_f32 and a 32-bit DS write on every target; no cache-invalidate
; instruction is expected.
define amdgpu_cs void @atomic_load_f32x2_monotonic_wavefront(ptr addrspace(3) %p, ptr addrspace(3) %out) {
; GFX9-LABEL: atomic_load_f32x2_monotonic_wavefront:
; GFX9: ; %bb.0:
; GFX9-NEXT: ds_read_b64 v[2:3], v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v2, v3
; GFX9-NEXT: ds_write_b32 v1, v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f32x2_monotonic_wavefront:
; GFX10: ; %bb.0:
; GFX10-NEXT: ds_read_b64 v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_f32_e32 v0, v2, v3
; GFX10-NEXT: ds_write_b32 v1, v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_load_f32x2_monotonic_wavefront:
; GFX11: ; %bb.0:
; GFX11-NEXT: ds_load_b64 v[2:3], v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v0, v2, v3
; GFX11-NEXT: ds_store_b32 v1, v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f32x2_monotonic_wavefront:
; GFX12: ; %bb.0:
; GFX12-NEXT: ds_load_b64 v[2:3], v0
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_add_f32_e32 v0, v2, v3
; GFX12-NEXT: ds_store_b32 v1, v0
; GFX12-NEXT: s_endpgm
%a0 = load atomic <2 x float>, ptr addrspace(3) %p syncscope("wavefront") monotonic, align 8
%num1 = extractelement <2 x float> %a0, i32 0
%num2 = extractelement <2 x float> %a0, i32 1
%res = fadd float %num1, %num2
store float %res, ptr addrspace(3) %out, align 4
ret void
}
; Atomically loads a <2 x half> from the addrspace(3) pointer %p with
; syncscope("agent") and monotonic ordering (4-byte aligned), adds the two
; elements and stores the half result to the addrspace(3) pointer %out.
; The checks below expect one 32-bit DS read on every target. How the upper
; half is reached differs per configuration: the gfx9 and gfx10 checks use an
; SDWA add that selects WORD_1 of the loaded register, the gfx11/gfx12
; SelectionDAG checks add the .l and .h halves of v0 directly, and the
; gfx11/gfx12 GlobalISel checks first shift the value right by 16 into v2.
define amdgpu_cs void @atomic_load_f16x2_monotonic_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) {
; GFX9-LABEL: atomic_load_f16x2_monotonic_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: ds_read_b32 v0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: ds_write_b16 v1, v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f16x2_monotonic_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: ds_read_b32 v0, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: ds_write_b16 v1, v0
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_f16x2_monotonic_agent:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: ds_load_b32 v0, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: ds_store_b16 v1, v0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_f16x2_monotonic_agent:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: ds_load_b32 v0, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l
; GFX11-GISEL-NEXT: ds_store_b16 v1, v0
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_f16x2_monotonic_agent:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: ds_load_b32 v0, v0
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: ds_store_b16 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_f16x2_monotonic_agent:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: ds_load_b32 v0, v0
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l
; GFX12-GISEL-NEXT: ds_store_b16 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
%a0 = load atomic <2 x half>, ptr addrspace(3) %p syncscope("agent") monotonic, align 4
%num1 = extractelement <2 x half> %a0, i32 0
%num2 = extractelement <2 x half> %a0, i32 1
%res = fadd half %num1, %num2
store half %res, ptr addrspace(3) %out, align 4
ret void
}
; Atomically loads a <2 x half> from the addrspace(3) pointer %p with
; syncscope("agent") and seq_cst ordering (4-byte aligned), adds the two
; elements and stores the half result to the addrspace(3) pointer %out.
; The checks below expect one 32-bit DS read and a wait on every target,
; followed by a cache invalidate on the newer ones: buffer_gl0_inv for gfx10
; and gfx11, global_inv scope:SCOPE_SE for gfx12. The gfx9 checks expect no
; invalidate. The add itself is an SDWA add on gfx9/gfx10, a .l/.h add for
; gfx11/gfx12 SelectionDAG, and a shift-by-16 plus add for gfx11/gfx12
; GlobalISel.
define amdgpu_cs void @atomic_load_f16x2_seq_cst_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) {
; GFX9-LABEL: atomic_load_f16x2_seq_cst_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: ds_read_b32 v0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: ds_write_b16 v1, v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f16x2_seq_cst_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: ds_read_b32 v0, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: ds_write_b16 v1, v0
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_f16x2_seq_cst_agent:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: ds_load_b32 v0, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: buffer_gl0_inv
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: ds_store_b16 v1, v0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_f16x2_seq_cst_agent:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: ds_load_b32 v0, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: buffer_gl0_inv
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l
; GFX11-GISEL-NEXT: ds_store_b16 v1, v0
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_f16x2_seq_cst_agent:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: ds_load_b32 v0, v0
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: ds_store_b16 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_f16x2_seq_cst_agent:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: ds_load_b32 v0, v0
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l
; GFX12-GISEL-NEXT: ds_store_b16 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
%a0 = load atomic <2 x half>, ptr addrspace(3) %p syncscope("agent") seq_cst, align 4
%num1 = extractelement <2 x half> %a0, i32 0
%num2 = extractelement <2 x half> %a0, i32 1
%res = fadd half %num1, %num2
store half %res, ptr addrspace(3) %out, align 4
ret void
}
; Atomically loads a <2 x half> from the addrspace(3) pointer %p with
; syncscope("wavefront") and monotonic ordering (4-byte aligned), adds the
; two elements and stores the half result to the addrspace(3) pointer %out.
; The checks below expect one 32-bit DS read and a wait on every target and
; no cache-invalidate instruction. The add is an SDWA add on gfx9/gfx10, a
; .l/.h add for gfx11/gfx12 SelectionDAG, and a shift-by-16 plus add for
; gfx11/gfx12 GlobalISel.
define amdgpu_cs void @atomic_load_f16x2_monotonic_wavefront(ptr addrspace(3) %p, ptr addrspace(3) %out) {
; GFX9-LABEL: atomic_load_f16x2_monotonic_wavefront:
; GFX9: ; %bb.0:
; GFX9-NEXT: ds_read_b32 v0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: ds_write_b16 v1, v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f16x2_monotonic_wavefront:
; GFX10: ; %bb.0:
; GFX10-NEXT: ds_read_b32 v0, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: ds_write_b16 v1, v0
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_f16x2_monotonic_wavefront:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: ds_load_b32 v0, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: ds_store_b16 v1, v0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_f16x2_monotonic_wavefront:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: ds_load_b32 v0, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l
; GFX11-GISEL-NEXT: ds_store_b16 v1, v0
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_f16x2_monotonic_wavefront:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: ds_load_b32 v0, v0
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: ds_store_b16 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_f16x2_monotonic_wavefront:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: ds_load_b32 v0, v0
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l
; GFX12-GISEL-NEXT: ds_store_b16 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
%a0 = load atomic <2 x half>, ptr addrspace(3) %p syncscope("wavefront") monotonic, align 4
%num1 = extractelement <2 x half> %a0, i32 0
%num2 = extractelement <2 x half> %a0, i32 1
%res = fadd half %num1, %num2
store half %res, ptr addrspace(3) %out, align 4
ret void
}
; Atomically loads a <2 x i16> from the addrspace(3) pointer %p with
; syncscope("agent") and monotonic ordering (4-byte aligned), adds the two
; elements and stores the i16 sum to the addrspace(3) pointer %out.
; The checks below expect one 32-bit DS read and a wait on every target and
; no cache-invalidate instruction. The gfx9 checks use an SDWA v_add_u16 that
; selects WORD_1; the gfx10 checks shift right by 16 and use v_add_nc_u16;
; the gfx11/gfx12 SelectionDAG checks add v0.l and v0.h directly; the
; gfx11/gfx12 GlobalISel checks shift right by 16 into v2 first.
define amdgpu_cs void @atomic_load_i16x2_monotonic_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) {
; GFX9-LABEL: atomic_load_i16x2_monotonic_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: ds_read_b32 v0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: ds_write_b16 v1, v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_i16x2_monotonic_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: ds_read_b32 v0, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_add_nc_u16 v0, v0, v2
; GFX10-NEXT: ds_write_b16 v1, v0
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_agent:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: ds_load_b32 v0, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: ds_store_b16 v1, v0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_agent:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: ds_load_b32 v0, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l
; GFX11-GISEL-NEXT: ds_store_b16 v1, v0
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_agent:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: ds_load_b32 v0, v0
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: ds_store_b16 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_agent:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: ds_load_b32 v0, v0
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l
; GFX12-GISEL-NEXT: ds_store_b16 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
%a = load atomic <2 x i16>, ptr addrspace(3) %p syncscope("agent") monotonic, align 4
%e0 = extractelement <2 x i16> %a, i32 0
%e1 = extractelement <2 x i16> %a, i32 1
%sum = add i16 %e0, %e1
store i16 %sum, ptr addrspace(3) %out, align 4
ret void
}
; Atomically loads a <2 x i16> from the addrspace(3) pointer %p with
; syncscope("agent") and seq_cst ordering (4-byte aligned), adds the two
; elements and stores the i16 sum to the addrspace(3) pointer %out.
; The checks below expect one 32-bit DS read and a wait on every target,
; followed by a cache invalidate on the newer ones: buffer_gl0_inv for gfx10
; and gfx11, global_inv scope:SCOPE_SE for gfx12. The gfx9 checks expect no
; invalidate. The add is an SDWA v_add_u16 on gfx9, a shift plus
; v_add_nc_u16 on gfx10 and for gfx11/gfx12 GlobalISel, and a .l/.h
; v_add_nc_u16 for gfx11/gfx12 SelectionDAG.
define amdgpu_cs void @atomic_load_i16x2_seq_cst_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) {
; GFX9-LABEL: atomic_load_i16x2_seq_cst_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: ds_read_b32 v0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: ds_write_b16 v1, v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_i16x2_seq_cst_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: ds_read_b32 v0, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_add_nc_u16 v0, v0, v2
; GFX10-NEXT: ds_write_b16 v1, v0
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_i16x2_seq_cst_agent:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: ds_load_b32 v0, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: buffer_gl0_inv
; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: ds_store_b16 v1, v0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_i16x2_seq_cst_agent:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: ds_load_b32 v0, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: buffer_gl0_inv
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l
; GFX11-GISEL-NEXT: ds_store_b16 v1, v0
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_i16x2_seq_cst_agent:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: ds_load_b32 v0, v0
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: ds_store_b16 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_i16x2_seq_cst_agent:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: ds_load_b32 v0, v0
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l
; GFX12-GISEL-NEXT: ds_store_b16 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
%a = load atomic <2 x i16>, ptr addrspace(3) %p syncscope("agent") seq_cst, align 4
%e0 = extractelement <2 x i16> %a, i32 0
%e1 = extractelement <2 x i16> %a, i32 1
%sum = add i16 %e0, %e1
store i16 %sum, ptr addrspace(3) %out, align 4
ret void
}
; Atomically loads a <2 x i16> from the addrspace(3) pointer %p with
; syncscope("wavefront") and monotonic ordering (4-byte aligned), adds the
; two elements and stores the i16 sum to the addrspace(3) pointer %out.
; The checks below expect one 32-bit DS read and a wait on every target and
; no cache-invalidate instruction. The add is an SDWA v_add_u16 on gfx9, a
; shift plus v_add_nc_u16 on gfx10 and for gfx11/gfx12 GlobalISel, and a
; .l/.h v_add_nc_u16 for gfx11/gfx12 SelectionDAG.
define amdgpu_cs void @atomic_load_i16x2_monotonic_wavefront(ptr addrspace(3) %p, ptr addrspace(3) %out) {
; GFX9-LABEL: atomic_load_i16x2_monotonic_wavefront:
; GFX9: ; %bb.0:
; GFX9-NEXT: ds_read_b32 v0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: ds_write_b16 v1, v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_i16x2_monotonic_wavefront:
; GFX10: ; %bb.0:
; GFX10-NEXT: ds_read_b32 v0, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_add_nc_u16 v0, v0, v2
; GFX10-NEXT: ds_write_b16 v1, v0
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_wavefront:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: ds_load_b32 v0, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: ds_store_b16 v1, v0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_wavefront:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: ds_load_b32 v0, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l
; GFX11-GISEL-NEXT: ds_store_b16 v1, v0
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_wavefront:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: ds_load_b32 v0, v0
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: ds_store_b16 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_wavefront:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: ds_load_b32 v0, v0
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l
; GFX12-GISEL-NEXT: ds_store_b16 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
%a = load atomic <2 x i16>, ptr addrspace(3) %p syncscope("wavefront") monotonic, align 4
%e0 = extractelement <2 x i16> %a, i32 0
%e1 = extractelement <2 x i16> %a, i32 1
%sum = add i16 %e0, %e1
store i16 %sum, ptr addrspace(3) %out, align 4
ret void
}
; Atomically loads a <4 x half> from the addrspace(3) pointer %p with
; syncscope("agent") and monotonic ordering (8-byte aligned), computes
; (e0 + e1) + (e2 * e3) over the four elements and stores the half result to
; the addrspace(3) pointer %out.
; The checks below expect a single 64-bit DS read into v[2:3] and a wait on
; every target, with no cache-invalidate instruction. The add and multiply
; are SDWA operations selecting WORD_1 on gfx9/gfx10, .l/.h operations for
; gfx11/gfx12 SelectionDAG, and are preceded by two shift-by-16 instructions
; for gfx11/gfx12 GlobalISel.
define amdgpu_cs void @atomic_load_f16x4_monotonic_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) {
; GFX9-LABEL: atomic_load_f16x4_monotonic_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: ds_read_b64 v[2:3], v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_mul_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_add_f16_e32 v0, v0, v2
; GFX9-NEXT: ds_write_b16 v1, v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f16x4_monotonic_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: ds_read_b64 v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_mul_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_add_f16_e32 v0, v0, v2
; GFX10-NEXT: ds_write_b16 v1, v0
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_f16x4_monotonic_agent:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: ds_load_b64 v[2:3], v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v2.l, v2.h
; GFX11-SDAG-NEXT: v_mul_f16_e32 v0.h, v3.l, v3.h
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: ds_store_b16 v1, v0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_f16x4_monotonic_agent:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: ds_load_b64 v[2:3], v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l
; GFX11-GISEL-NEXT: v_mul_f16_e32 v0.h, v3.l, v4.l
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-GISEL-NEXT: ds_store_b16 v1, v0
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_f16x4_monotonic_agent:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: ds_load_b64 v[2:3], v0
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v2.l, v2.h
; GFX12-SDAG-NEXT: v_mul_f16_e32 v0.h, v3.l, v3.h
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: ds_store_b16 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_f16x4_monotonic_agent:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: ds_load_b64 v[2:3], v0
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l
; GFX12-GISEL-NEXT: v_mul_f16_e32 v0.h, v3.l, v4.l
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-GISEL-NEXT: ds_store_b16 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
%a0 = load atomic <4 x half>, ptr addrspace(3) %p syncscope("agent") monotonic, align 8
%num1 = extractelement <4 x half> %a0, i32 0
%num2 = extractelement <4 x half> %a0, i32 1
%num3 = extractelement <4 x half> %a0, i32 2
%num4 = extractelement <4 x half> %a0, i32 3
%add = fadd half %num1, %num2
%mul = fmul half %num3, %num4
%res = fadd half %add, %mul
store half %res, ptr addrspace(3) %out, align 4
ret void
}
; Atomically loads a <4 x half> from the addrspace(3) pointer %p with
; syncscope("agent") and seq_cst ordering (8-byte aligned), computes
; (e0 + e1) + (e2 * e3) over the four elements and stores the half result to
; the addrspace(3) pointer %out.
; The checks below expect a single 64-bit DS read into v[2:3] and a wait on
; every target, followed by a cache invalidate on the newer ones:
; buffer_gl0_inv for gfx10 and gfx11, global_inv scope:SCOPE_SE for gfx12.
; The gfx9 checks expect no invalidate. The arithmetic uses SDWA operations
; on gfx9/gfx10, .l/.h operations for gfx11/gfx12 SelectionDAG, and two extra
; shift-by-16 instructions for gfx11/gfx12 GlobalISel.
define amdgpu_cs void @atomic_load_f16x4_seq_cst_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) {
; GFX9-LABEL: atomic_load_f16x4_seq_cst_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: ds_read_b64 v[2:3], v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_mul_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_add_f16_e32 v0, v0, v2
; GFX9-NEXT: ds_write_b16 v1, v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f16x4_seq_cst_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: ds_read_b64 v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_add_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_mul_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_add_f16_e32 v0, v0, v2
; GFX10-NEXT: ds_write_b16 v1, v0
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_f16x4_seq_cst_agent:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: ds_load_b64 v[2:3], v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: buffer_gl0_inv
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v2.l, v2.h
; GFX11-SDAG-NEXT: v_mul_f16_e32 v0.h, v3.l, v3.h
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: ds_store_b16 v1, v0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_f16x4_seq_cst_agent:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: ds_load_b64 v[2:3], v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: buffer_gl0_inv
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l
; GFX11-GISEL-NEXT: v_mul_f16_e32 v0.h, v3.l, v4.l
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-GISEL-NEXT: ds_store_b16 v1, v0
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_f16x4_seq_cst_agent:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: ds_load_b64 v[2:3], v0
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v2.l, v2.h
; GFX12-SDAG-NEXT: v_mul_f16_e32 v0.h, v3.l, v3.h
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: ds_store_b16 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_f16x4_seq_cst_agent:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: ds_load_b64 v[2:3], v0
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l
; GFX12-GISEL-NEXT: v_mul_f16_e32 v0.h, v3.l, v4.l
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-GISEL-NEXT: ds_store_b16 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
%a0 = load atomic <4 x half>, ptr addrspace(3) %p syncscope("agent") seq_cst, align 8
%num1 = extractelement <4 x half> %a0, i32 0
%num2 = extractelement <4 x half> %a0, i32 1
%num3 = extractelement <4 x half> %a0, i32 2
%num4 = extractelement <4 x half> %a0, i32 3
%add = fadd half %num1, %num2
%mul = fmul half %num3, %num4
%res = fadd half %add, %mul
store half %res, ptr addrspace(3) %out, align 4
ret void
}
; Atomically loads a <4 x half> from the addrspace(3) pointer %p with
; syncscope("wavefront") and monotonic ordering (8-byte aligned), computes
; (e0 + e1) + (e2 * e3) over the four elements and stores the half result to
; the addrspace(3) pointer %out.
; The checks below expect a single 64-bit DS read into v[2:3] and a wait on
; every target, with no cache-invalidate instruction. The arithmetic uses
; SDWA operations on gfx9/gfx10, .l/.h operations for gfx11/gfx12
; SelectionDAG, and two extra shift-by-16 instructions for gfx11/gfx12
; GlobalISel.
define amdgpu_cs void @atomic_load_f16x4_monotonic_wavefront(ptr addrspace(3) %p, ptr addrspace(3) %out) {
; GFX9-LABEL: atomic_load_f16x4_monotonic_wavefront:
; GFX9: ; %bb.0:
; GFX9-NEXT: ds_read_b64 v[2:3], v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_mul_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_add_f16_e32 v0, v0, v2
; GFX9-NEXT: ds_write_b16 v1, v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f16x4_monotonic_wavefront:
; GFX10: ; %bb.0:
; GFX10-NEXT: ds_read_b64 v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_mul_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_add_f16_e32 v0, v0, v2
; GFX10-NEXT: ds_write_b16 v1, v0
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_f16x4_monotonic_wavefront:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: ds_load_b64 v[2:3], v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v2.l, v2.h
; GFX11-SDAG-NEXT: v_mul_f16_e32 v0.h, v3.l, v3.h
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: ds_store_b16 v1, v0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_f16x4_monotonic_wavefront:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: ds_load_b64 v[2:3], v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l
; GFX11-GISEL-NEXT: v_mul_f16_e32 v0.h, v3.l, v4.l
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-GISEL-NEXT: ds_store_b16 v1, v0
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_f16x4_monotonic_wavefront:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: ds_load_b64 v[2:3], v0
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v2.l, v2.h
; GFX12-SDAG-NEXT: v_mul_f16_e32 v0.h, v3.l, v3.h
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: ds_store_b16 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_f16x4_monotonic_wavefront:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: ds_load_b64 v[2:3], v0
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l
; GFX12-GISEL-NEXT: v_mul_f16_e32 v0.h, v3.l, v4.l
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX12-GISEL-NEXT: ds_store_b16 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
%a0 = load atomic <4 x half>, ptr addrspace(3) %p syncscope("wavefront") monotonic, align 8
%num1 = extractelement <4 x half> %a0, i32 0
%num2 = extractelement <4 x half> %a0, i32 1
%num3 = extractelement <4 x half> %a0, i32 2
%num4 = extractelement <4 x half> %a0, i32 3
%add = fadd half %num1, %num2
%mul = fmul half %num3, %num4
%res = fadd half %add, %mul
store half %res, ptr addrspace(3) %out, align 4
ret void
}
; Atomically loads a <4 x i16> from the addrspace(3) pointer %p with
; syncscope("agent") and monotonic ordering (8-byte aligned), computes
; (e0 + e1) + (e2 * e3) over the four elements and stores the i16 result to
; the addrspace(3) pointer %out.
; The checks below expect a single 64-bit DS read into v[2:3] and a wait on
; every target, with no cache-invalidate instruction. In every configuration
; the multiply and the final add are expected as one multiply-add
; (v_mad_legacy_u16 on gfx9, v_mad_u16 elsewhere) that takes the e0 + e1 sum
; as its addend.
define amdgpu_cs void @atomic_load_i16x4_monotonic_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) {
; GFX9-LABEL: atomic_load_i16x4_monotonic_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: ds_read_b64 v[2:3], v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX9-NEXT: v_add_u16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_mad_legacy_u16 v0, v3, v0, v2
; GFX9-NEXT: ds_write_b16 v1, v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_i16x4_monotonic_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: ds_read_b64 v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX10-NEXT: v_add_nc_u16 v0, v2, v0
; GFX10-NEXT: v_mad_u16 v0, v3, v4, v0
; GFX10-NEXT: ds_write_b16 v1, v0
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_i16x4_monotonic_agent:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: ds_load_b64 v[2:3], v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v2.l, v2.h
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_mad_u16 v0.l, v3.l, v3.h, v0.l
; GFX11-SDAG-NEXT: ds_store_b16 v1, v0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_i16x4_monotonic_agent:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: ds_load_b64 v[2:3], v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v2.l, v0.l
; GFX11-GISEL-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l
; GFX11-GISEL-NEXT: ds_store_b16 v1, v0
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_i16x4_monotonic_agent:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: ds_load_b64 v[2:3], v0
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v2.l, v2.h
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_mad_u16 v0.l, v3.l, v3.h, v0.l
; GFX12-SDAG-NEXT: ds_store_b16 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_i16x4_monotonic_agent:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: ds_load_b64 v[2:3], v0
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v2.l, v0.l
; GFX12-GISEL-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l
; GFX12-GISEL-NEXT: ds_store_b16 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
%a0 = load atomic <4 x i16>, ptr addrspace(3) %p syncscope("agent") monotonic, align 8
%num1 = extractelement <4 x i16> %a0, i32 0
%num2 = extractelement <4 x i16> %a0, i32 1
%num3 = extractelement <4 x i16> %a0, i32 2
%num4 = extractelement <4 x i16> %a0, i32 3
%add = add i16 %num1, %num2
%mul = mul i16 %num3, %num4
%res = add i16 %add, %mul
store i16 %res, ptr addrspace(3) %out, align 4
ret void
}
; Atomic load of <4 x i16> through an addrspace(3) pointer with
; syncscope("agent") seq_cst ordering and align 8. The four lanes are combined
; as (e0 + e1) + (e2 * e3) and the i16 result is stored through %out.
; Every target below expects a single 64-bit ds_ load followed by a wait. The
; gfx10 and gfx11 expectations additionally contain buffer_gl0_inv after the
; wait, the gfx12 expectations contain global_inv scope:SCOPE_SE, and the gfx9
; expectations contain no invalidate instruction.
define amdgpu_cs void @atomic_load_i16x4_seq_cst_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) {
; GFX9-LABEL: atomic_load_i16x4_seq_cst_agent:
; GFX9: ; %bb.0:
; GFX9-NEXT: ds_read_b64 v[2:3], v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX9-NEXT: v_add_u16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_mad_legacy_u16 v0, v3, v0, v2
; GFX9-NEXT: ds_write_b16 v1, v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_i16x4_seq_cst_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: ds_read_b64 v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX10-NEXT: v_add_nc_u16 v0, v2, v0
; GFX10-NEXT: v_mad_u16 v0, v3, v4, v0
; GFX10-NEXT: ds_write_b16 v1, v0
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_i16x4_seq_cst_agent:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: ds_load_b64 v[2:3], v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: buffer_gl0_inv
; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v2.l, v2.h
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_mad_u16 v0.l, v3.l, v3.h, v0.l
; GFX11-SDAG-NEXT: ds_store_b16 v1, v0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_i16x4_seq_cst_agent:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: ds_load_b64 v[2:3], v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: buffer_gl0_inv
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v2.l, v0.l
; GFX11-GISEL-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l
; GFX11-GISEL-NEXT: ds_store_b16 v1, v0
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_i16x4_seq_cst_agent:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: ds_load_b64 v[2:3], v0
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v2.l, v2.h
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_mad_u16 v0.l, v3.l, v3.h, v0.l
; GFX12-SDAG-NEXT: ds_store_b16 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_i16x4_seq_cst_agent:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: ds_load_b64 v[2:3], v0
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v2.l, v0.l
; GFX12-GISEL-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l
; GFX12-GISEL-NEXT: ds_store_b16 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
; The whole 64-bit vector is read by one atomic load.
%a0 = load atomic <4 x i16>, ptr addrspace(3) %p syncscope("agent") seq_cst, align 8
; Use all four lanes so that none of the loaded halves is dead.
%num1 = extractelement <4 x i16> %a0, i32 0
%num2 = extractelement <4 x i16> %a0, i32 1
%num3 = extractelement <4 x i16> %a0, i32 2
%num4 = extractelement <4 x i16> %a0, i32 3
%add = add i16 %num1, %num2
%mul = mul i16 %num3, %num4
%res = add i16 %add, %mul
store i16 %res, ptr addrspace(3) %out, align 4
ret void
}
; Atomic load of <4 x i16> through an addrspace(3) pointer with
; syncscope("wavefront") monotonic ordering and align 8. The four lanes are
; combined as (e0 + e1) + (e2 * e3) and the i16 result is stored through %out.
; Every target below expects a single 64-bit ds_ load followed by a wait, and
; none of the expectations contains a cache invalidate instruction.
define amdgpu_cs void @atomic_load_i16x4_monotonic_wavefront(ptr addrspace(3) %p, ptr addrspace(3) %out) {
; GFX9-LABEL: atomic_load_i16x4_monotonic_wavefront:
; GFX9: ; %bb.0:
; GFX9-NEXT: ds_read_b64 v[2:3], v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX9-NEXT: v_add_u16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_mad_legacy_u16 v0, v3, v0, v2
; GFX9-NEXT: ds_write_b16 v1, v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_i16x4_monotonic_wavefront:
; GFX10: ; %bb.0:
; GFX10-NEXT: ds_read_b64 v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX10-NEXT: v_add_nc_u16 v0, v2, v0
; GFX10-NEXT: v_mad_u16 v0, v3, v4, v0
; GFX10-NEXT: ds_write_b16 v1, v0
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_i16x4_monotonic_wavefront:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: ds_load_b64 v[2:3], v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v2.l, v2.h
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_mad_u16 v0.l, v3.l, v3.h, v0.l
; GFX11-SDAG-NEXT: ds_store_b16 v1, v0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_i16x4_monotonic_wavefront:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: ds_load_b64 v[2:3], v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v2.l, v0.l
; GFX11-GISEL-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l
; GFX11-GISEL-NEXT: ds_store_b16 v1, v0
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_i16x4_monotonic_wavefront:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: ds_load_b64 v[2:3], v0
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v2.l, v2.h
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_mad_u16 v0.l, v3.l, v3.h, v0.l
; GFX12-SDAG-NEXT: ds_store_b16 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_i16x4_monotonic_wavefront:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: ds_load_b64 v[2:3], v0
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v2.l, v0.l
; GFX12-GISEL-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l
; GFX12-GISEL-NEXT: ds_store_b16 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
; The whole 64-bit vector is read by one atomic load.
%a0 = load atomic <4 x i16>, ptr addrspace(3) %p syncscope("wavefront") monotonic, align 8
; Use all four lanes so that none of the loaded halves is dead.
%num1 = extractelement <4 x i16> %a0, i32 0
%num2 = extractelement <4 x i16> %a0, i32 1
%num3 = extractelement <4 x i16> %a0, i32 2
%num4 = extractelement <4 x i16> %a0, i32 3
%add = add i16 %num1, %num2
%mul = mul i16 %num3, %num4
%res = add i16 %add, %mul
store i16 %res, ptr addrspace(3) %out, align 4
ret void
}
; Atomic load of <2 x float> with syncscope("agent") monotonic ordering from
; the addrspace(3) address %p plus one byte. The two lanes are added with fadd
; and the float result is stored through %out.
; Every target below expects the constant to be folded into the 64-bit ds_ load
; as an immediate offset of 1, with no separate address add.
define amdgpu_cs void @atomic_load_f32x2_monotonic_agent_offset_1(ptr addrspace(3) %p, ptr addrspace(3) %out) {
; GFX9-LABEL: atomic_load_f32x2_monotonic_agent_offset_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: ds_read_b64 v[2:3], v0 offset:1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v2, v3
; GFX9-NEXT: ds_write_b32 v1, v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f32x2_monotonic_agent_offset_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: ds_read_b64 v[2:3], v0 offset:1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_f32_e32 v0, v2, v3
; GFX10-NEXT: ds_write_b32 v1, v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_load_f32x2_monotonic_agent_offset_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: ds_load_b64 v[2:3], v0 offset:1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v0, v2, v3
; GFX11-NEXT: ds_store_b32 v1, v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f32x2_monotonic_agent_offset_1:
; GFX12: ; %bb.0:
; GFX12-NEXT: ds_load_b64 v[2:3], v0 offset:1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_add_f32_e32 v0, v2, v3
; GFX12-NEXT: ds_store_b32 v1, v0
; GFX12-NEXT: s_endpgm
; Byte-granular GEP so the constant shows up unscaled in the expected offset.
%gep = getelementptr inbounds i8, ptr addrspace(3) %p, i64 1
; NOTE(review) the load states align 8 although the address is %p + 1; this
; presumably exists only to exercise offset folding - confirm it is intended.
%a0 = load atomic <2 x float>, ptr addrspace(3) %gep syncscope("agent") monotonic, align 8
%num1 = extractelement <2 x float> %a0, i32 0
%num2 = extractelement <2 x float> %a0, i32 1
%res = fadd float %num1, %num2
store float %res, ptr addrspace(3) %out, align 4
ret void
}
; Atomic load of <2 x float> with syncscope("agent") monotonic ordering from
; the addrspace(3) address %p plus 4095 bytes. The two lanes are added with
; fadd and the float result is stored through %out.
; Every target below expects the constant to be folded into the 64-bit ds_ load
; as an immediate offset of 4095, with no separate address add.
; NOTE(review) the name says "max" but nothing in this test shows which limit
; 4095 corresponds to; confirm it against the offset field width of the ds_
; instructions.
define amdgpu_cs void @atomic_load_f32x2_monotonic_agent_offset_max(ptr addrspace(3) %p, ptr addrspace(3) %out) {
; GFX9-LABEL: atomic_load_f32x2_monotonic_agent_offset_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: ds_read_b64 v[2:3], v0 offset:4095
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v2, v3
; GFX9-NEXT: ds_write_b32 v1, v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_f32x2_monotonic_agent_offset_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: ds_read_b64 v[2:3], v0 offset:4095
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_f32_e32 v0, v2, v3
; GFX10-NEXT: ds_write_b32 v1, v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_load_f32x2_monotonic_agent_offset_max:
; GFX11: ; %bb.0:
; GFX11-NEXT: ds_load_b64 v[2:3], v0 offset:4095
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v0, v2, v3
; GFX11-NEXT: ds_store_b32 v1, v0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f32x2_monotonic_agent_offset_max:
; GFX12: ; %bb.0:
; GFX12-NEXT: ds_load_b64 v[2:3], v0 offset:4095
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_add_f32_e32 v0, v2, v3
; GFX12-NEXT: ds_store_b32 v1, v0
; GFX12-NEXT: s_endpgm
; Byte-granular GEP so the constant shows up unscaled in the expected offset.
%gep = getelementptr inbounds i8, ptr addrspace(3) %p, i64 4095
%a0 = load atomic <2 x float>, ptr addrspace(3) %gep syncscope("agent") monotonic, align 8
%num1 = extractelement <2 x float> %a0, i32 0
%num2 = extractelement <2 x float> %a0, i32 1
%res = fadd float %num1, %num2
store float %res, ptr addrspace(3) %out, align 4
ret void
}
; Atomic load of <2 x i16> with syncscope("agent") monotonic ordering from the
; addrspace(3) address %p plus one byte. The two lanes are added and the i16
; sum is stored through %out.
; Every target below expects the constant to be folded into a 32-bit ds_ load
; as an immediate offset of 1. The expectations differ only in how the high
; half is extracted. The gfx9 output uses an SDWA add selecting WORD_1. The
; gfx10 output shifts right by 16 first. On gfx11 and gfx12 the
; -global-isel=0 runs add v0.l and v0.h directly, while the -global-isel=1
; runs shift first.
define amdgpu_cs void @atomic_load_i16x2_monotonic_agent_offset_1(ptr addrspace(3) %p, ptr addrspace(3) %out) {
; GFX9-LABEL: atomic_load_i16x2_monotonic_agent_offset_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: ds_read_b32 v0, v0 offset:1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: ds_write_b16 v1, v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_i16x2_monotonic_agent_offset_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: ds_read_b32 v0, v0 offset:1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_add_nc_u16 v0, v0, v2
; GFX10-NEXT: ds_write_b16 v1, v0
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_1:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: ds_load_b32 v0, v0 offset:1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: ds_store_b16 v1, v0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_1:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: ds_load_b32 v0, v0 offset:1
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l
; GFX11-GISEL-NEXT: ds_store_b16 v1, v0
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_1:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: ds_load_b32 v0, v0 offset:1
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: ds_store_b16 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_1:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: ds_load_b32 v0, v0 offset:1
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l
; GFX12-GISEL-NEXT: ds_store_b16 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
; Byte-granular GEP so the constant shows up unscaled in the expected offset.
%gep = getelementptr inbounds i8, ptr addrspace(3) %p, i64 1
; NOTE(review) the load states align 8 for a 4-byte vector at %p + 1; this
; presumably exists only to exercise offset folding - confirm it is intended.
%a = load atomic <2 x i16>, ptr addrspace(3) %gep syncscope("agent") monotonic, align 8
%e0 = extractelement <2 x i16> %a, i32 0
%e1 = extractelement <2 x i16> %a, i32 1
%sum = add i16 %e0, %e1
store i16 %sum, ptr addrspace(3) %out, align 4
ret void
}
; Atomic load of <2 x i16> with syncscope("agent") monotonic ordering from the
; addrspace(3) address %p plus 4095 bytes. The two lanes are added and the i16
; sum is stored through %out.
; Every target below expects the constant to be folded into a 32-bit ds_ load
; as an immediate offset of 4095, with no separate address add.
; NOTE(review) the name says "max" but nothing in this test shows which limit
; 4095 corresponds to; confirm it against the offset field width of the ds_
; instructions.
define amdgpu_cs void @atomic_load_i16x2_monotonic_agent_offset_max(ptr addrspace(3) %p, ptr addrspace(3) %out) {
; GFX9-LABEL: atomic_load_i16x2_monotonic_agent_offset_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: ds_read_b32 v0, v0 offset:4095
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: ds_write_b16 v1, v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_load_i16x2_monotonic_agent_offset_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: ds_read_b32 v0, v0 offset:4095
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_add_nc_u16 v0, v0, v2
; GFX10-NEXT: ds_write_b16 v1, v0
; GFX10-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_max:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: ds_load_b32 v0, v0 offset:4095
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT: ds_store_b16 v1, v0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_max:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: ds_load_b32 v0, v0 offset:4095
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l
; GFX11-GISEL-NEXT: ds_store_b16 v1, v0
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_max:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: ds_load_b32 v0, v0 offset:4095
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX12-SDAG-NEXT: ds_store_b16 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_max:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: ds_load_b32 v0, v0 offset:4095
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l
; GFX12-GISEL-NEXT: ds_store_b16 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
; Byte-granular GEP so the constant shows up unscaled in the expected offset.
%gep = getelementptr inbounds i8, ptr addrspace(3) %p, i64 4095
%a = load atomic <2 x i16>, ptr addrspace(3) %gep syncscope("agent") monotonic, align 8
%e0 = extractelement <2 x i16> %a, i32 0
%e1 = extractelement <2 x i16> %a, i32 1
%sum = add i16 %e0, %e1
store i16 %sum, ptr addrspace(3) %out, align 4
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
; GFX10-GISEL: {{.*}}
; GFX10-SDAG: {{.*}}
; GFX9-GISEL: {{.*}}
; GFX9-SDAG: {{.*}}