| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-SDAG %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-GISEL %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s |
| |
; Monotonic, agent-scope atomic load of <2 x float> through an addrspace(3)
; pointer. Both lanes are extracted, added, and the float sum is stored to %out.
; Every checked target expects a single 64-bit DS load (ds_read_b64 on
; GFX9/GFX10, ds_load_b64 on GFX11/GFX12) followed by a wait; the SelectionDAG
; and GlobalISel RUN lines share one prefix per target here.
| define amdgpu_cs void @atomic_load_f32x2_monotonic_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { |
| ; GFX9-LABEL: atomic_load_f32x2_monotonic_agent: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: ds_read_b64 v[2:3], v0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX9-NEXT: ds_write_b32 v1, v0 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: atomic_load_f32x2_monotonic_agent: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: ds_read_b64 v[2:3], v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX10-NEXT: ds_write_b32 v1, v0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: atomic_load_f32x2_monotonic_agent: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX11-NEXT: ds_store_b32 v1, v0 |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: atomic_load_f32x2_monotonic_agent: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX12-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX12-NEXT: ds_store_b32 v1, v0 |
| ; GFX12-NEXT: s_endpgm |
| %a0 = load atomic <2 x float>, ptr addrspace(3) %p syncscope("agent") monotonic, align 8 |
| %num1 = extractelement <2 x float> %a0, i32 0 |
| %num2 = extractelement <2 x float> %a0, i32 1 |
| %res = fadd float %num1, %num2 |
| store float %res, ptr addrspace(3) %out, align 4 |
| ret void |
| } |
| |
; seq_cst, agent-scope atomic load of <2 x float> through an addrspace(3)
; pointer; the two lanes are added and the float sum is stored to %out.
; GFX9 expects only the 64-bit DS load and the wait. GFX10 and GFX11
; additionally expect buffer_gl0_inv right after the wait, and GFX12 expects
; global_inv scope:SCOPE_SE in the same position.
| define amdgpu_cs void @atomic_load_f32x2_seq_cst_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { |
| ; GFX9-LABEL: atomic_load_f32x2_seq_cst_agent: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: ds_read_b64 v[2:3], v0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX9-NEXT: ds_write_b32 v1, v0 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: atomic_load_f32x2_seq_cst_agent: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: ds_read_b64 v[2:3], v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX10-NEXT: ds_write_b32 v1, v0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: atomic_load_f32x2_seq_cst_agent: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX11-NEXT: ds_store_b32 v1, v0 |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: atomic_load_f32x2_seq_cst_agent: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX12-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-NEXT: global_inv scope:SCOPE_SE |
| ; GFX12-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX12-NEXT: ds_store_b32 v1, v0 |
| ; GFX12-NEXT: s_endpgm |
| %a0 = load atomic <2 x float>, ptr addrspace(3) %p syncscope("agent") seq_cst, align 8 |
| %num1 = extractelement <2 x float> %a0, i32 0 |
| %num2 = extractelement <2 x float> %a0, i32 1 |
| %res = fadd float %num1, %num2 |
| store float %res, ptr addrspace(3) %out, align 4 |
| ret void |
| } |
| |
; Monotonic, wavefront-scope atomic load of <2 x float> through an
; addrspace(3) pointer; the two lanes are added and the float sum is stored
; to %out. Each target expects exactly: one 64-bit DS load, a wait, the f32
; add, the 32-bit DS store, and s_endpgm.
| define amdgpu_cs void @atomic_load_f32x2_monotonic_wavefront(ptr addrspace(3) %p, ptr addrspace(3) %out) { |
| ; GFX9-LABEL: atomic_load_f32x2_monotonic_wavefront: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: ds_read_b64 v[2:3], v0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX9-NEXT: ds_write_b32 v1, v0 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: atomic_load_f32x2_monotonic_wavefront: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: ds_read_b64 v[2:3], v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX10-NEXT: ds_write_b32 v1, v0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: atomic_load_f32x2_monotonic_wavefront: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX11-NEXT: ds_store_b32 v1, v0 |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: atomic_load_f32x2_monotonic_wavefront: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX12-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX12-NEXT: ds_store_b32 v1, v0 |
| ; GFX12-NEXT: s_endpgm |
| %a0 = load atomic <2 x float>, ptr addrspace(3) %p syncscope("wavefront") monotonic, align 8 |
| %num1 = extractelement <2 x float> %a0, i32 0 |
| %num2 = extractelement <2 x float> %a0, i32 1 |
| %res = fadd float %num1, %num2 |
| store float %res, ptr addrspace(3) %out, align 4 |
| ret void |
| } |
| |
; Monotonic, agent-scope atomic load of <2 x half> through an addrspace(3)
; pointer; the two lanes are added and the half sum is stored to %out.
; Every target expects a single 32-bit DS load. GFX9/GFX10 expect one
; v_add_f16_sdwa that reads the high half via src1_sel:WORD_1. On GFX11/GFX12
; the selectors differ: SelectionDAG adds v0.l and v0.h directly, while
; GlobalISel first shifts the high half down with v_lshrrev_b32.
| define amdgpu_cs void @atomic_load_f16x2_monotonic_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { |
| ; GFX9-LABEL: atomic_load_f16x2_monotonic_agent: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: ds_read_b32 v0, v0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX9-NEXT: ds_write_b16 v1, v0 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: atomic_load_f16x2_monotonic_agent: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: ds_read_b32 v0, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX10-NEXT: ds_write_b16 v1, v0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-SDAG-LABEL: atomic_load_f16x2_monotonic_agent: |
| ; GFX11-SDAG: ; %bb.0: |
| ; GFX11-SDAG-NEXT: ds_load_b32 v0, v0 |
| ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h |
| ; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX11-GISEL-LABEL: atomic_load_f16x2_monotonic_agent: |
| ; GFX11-GISEL: ; %bb.0: |
| ; GFX11-GISEL-NEXT: ds_load_b32 v0, v0 |
| ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l |
| ; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-GISEL-NEXT: s_endpgm |
| ; |
| ; GFX12-SDAG-LABEL: atomic_load_f16x2_monotonic_agent: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: ds_load_b32 v0, v0 |
| ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h |
| ; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX12-GISEL-LABEL: atomic_load_f16x2_monotonic_agent: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: ds_load_b32 v0, v0 |
| ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l |
| ; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-GISEL-NEXT: s_endpgm |
| %a0 = load atomic <2 x half>, ptr addrspace(3) %p syncscope("agent") monotonic, align 4 |
| %num1 = extractelement <2 x half> %a0, i32 0 |
| %num2 = extractelement <2 x half> %a0, i32 1 |
| %res = fadd half %num1, %num2 |
| store half %res, ptr addrspace(3) %out, align 4 |
| ret void |
| } |
| |
; seq_cst, agent-scope atomic load of <2 x half> through an addrspace(3)
; pointer; the two lanes are added and the half sum is stored to %out.
; Every target expects a single 32-bit DS load. GFX9 expects no instruction
; between the wait and the add; GFX10/GFX11 expect buffer_gl0_inv there and
; GFX12 expects global_inv scope:SCOPE_SE. On GFX11/GFX12 SelectionDAG adds
; v0.l and v0.h directly, while GlobalISel shifts the high half down first.
| define amdgpu_cs void @atomic_load_f16x2_seq_cst_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { |
| ; GFX9-LABEL: atomic_load_f16x2_seq_cst_agent: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: ds_read_b32 v0, v0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX9-NEXT: ds_write_b16 v1, v0 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: atomic_load_f16x2_seq_cst_agent: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: ds_read_b32 v0, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX10-NEXT: ds_write_b16 v1, v0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-SDAG-LABEL: atomic_load_f16x2_seq_cst_agent: |
| ; GFX11-SDAG: ; %bb.0: |
| ; GFX11-SDAG-NEXT: ds_load_b32 v0, v0 |
| ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-SDAG-NEXT: buffer_gl0_inv |
| ; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h |
| ; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX11-GISEL-LABEL: atomic_load_f16x2_seq_cst_agent: |
| ; GFX11-GISEL: ; %bb.0: |
| ; GFX11-GISEL-NEXT: ds_load_b32 v0, v0 |
| ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-GISEL-NEXT: buffer_gl0_inv |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l |
| ; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-GISEL-NEXT: s_endpgm |
| ; |
| ; GFX12-SDAG-LABEL: atomic_load_f16x2_seq_cst_agent: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: ds_load_b32 v0, v0 |
| ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE |
| ; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h |
| ; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX12-GISEL-LABEL: atomic_load_f16x2_seq_cst_agent: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: ds_load_b32 v0, v0 |
| ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l |
| ; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-GISEL-NEXT: s_endpgm |
| %a0 = load atomic <2 x half>, ptr addrspace(3) %p syncscope("agent") seq_cst, align 4 |
| %num1 = extractelement <2 x half> %a0, i32 0 |
| %num2 = extractelement <2 x half> %a0, i32 1 |
| %res = fadd half %num1, %num2 |
| store half %res, ptr addrspace(3) %out, align 4 |
| ret void |
| } |
| |
; Monotonic, wavefront-scope atomic load of <2 x half> through an
; addrspace(3) pointer; the two lanes are added and the half sum is stored
; to %out. Every target expects a single 32-bit DS load followed by a wait.
; GFX9/GFX10 expect one v_add_f16_sdwa; on GFX11/GFX12 SelectionDAG adds v0.l
; and v0.h directly, while GlobalISel emits a v_lshrrev_b32 before the add.
| define amdgpu_cs void @atomic_load_f16x2_monotonic_wavefront(ptr addrspace(3) %p, ptr addrspace(3) %out) { |
| ; GFX9-LABEL: atomic_load_f16x2_monotonic_wavefront: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: ds_read_b32 v0, v0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX9-NEXT: ds_write_b16 v1, v0 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: atomic_load_f16x2_monotonic_wavefront: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: ds_read_b32 v0, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX10-NEXT: ds_write_b16 v1, v0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-SDAG-LABEL: atomic_load_f16x2_monotonic_wavefront: |
| ; GFX11-SDAG: ; %bb.0: |
| ; GFX11-SDAG-NEXT: ds_load_b32 v0, v0 |
| ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h |
| ; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX11-GISEL-LABEL: atomic_load_f16x2_monotonic_wavefront: |
| ; GFX11-GISEL: ; %bb.0: |
| ; GFX11-GISEL-NEXT: ds_load_b32 v0, v0 |
| ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l |
| ; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-GISEL-NEXT: s_endpgm |
| ; |
| ; GFX12-SDAG-LABEL: atomic_load_f16x2_monotonic_wavefront: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: ds_load_b32 v0, v0 |
| ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h |
| ; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX12-GISEL-LABEL: atomic_load_f16x2_monotonic_wavefront: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: ds_load_b32 v0, v0 |
| ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l |
| ; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-GISEL-NEXT: s_endpgm |
| %a0 = load atomic <2 x half>, ptr addrspace(3) %p syncscope("wavefront") monotonic, align 4 |
| %num1 = extractelement <2 x half> %a0, i32 0 |
| %num2 = extractelement <2 x half> %a0, i32 1 |
| %res = fadd half %num1, %num2 |
| store half %res, ptr addrspace(3) %out, align 4 |
| ret void |
| } |
| |
; Monotonic, agent-scope atomic load of <2 x i16> through an addrspace(3)
; pointer; the two lanes are added and the i16 sum is stored to %out.
; Every target expects a single 32-bit DS load. GFX9 expects one
; v_add_u16_sdwa; GFX10 expects v_lshrrev_b32 followed by v_add_nc_u16.
; On GFX11/GFX12 SelectionDAG adds v0.l and v0.h directly, while GlobalISel
; shifts the high half down with v_lshrrev_b32 first.
| define amdgpu_cs void @atomic_load_i16x2_monotonic_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { |
| ; GFX9-LABEL: atomic_load_i16x2_monotonic_agent: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: ds_read_b32 v0, v0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX9-NEXT: ds_write_b16 v1, v0 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: atomic_load_i16x2_monotonic_agent: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: ds_read_b32 v0, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX10-NEXT: v_add_nc_u16 v0, v0, v2 |
| ; GFX10-NEXT: ds_write_b16 v1, v0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_agent: |
| ; GFX11-SDAG: ; %bb.0: |
| ; GFX11-SDAG-NEXT: ds_load_b32 v0, v0 |
| ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h |
| ; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_agent: |
| ; GFX11-GISEL: ; %bb.0: |
| ; GFX11-GISEL-NEXT: ds_load_b32 v0, v0 |
| ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l |
| ; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-GISEL-NEXT: s_endpgm |
| ; |
| ; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_agent: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: ds_load_b32 v0, v0 |
| ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h |
| ; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_agent: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: ds_load_b32 v0, v0 |
| ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l |
| ; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-GISEL-NEXT: s_endpgm |
| %a = load atomic <2 x i16>, ptr addrspace(3) %p syncscope("agent") monotonic, align 4 |
| %e0 = extractelement <2 x i16> %a, i32 0 |
| %e1 = extractelement <2 x i16> %a, i32 1 |
| %sum = add i16 %e0, %e1 |
| store i16 %sum, ptr addrspace(3) %out, align 4 |
| ret void |
| } |
| |
; seq_cst, agent-scope atomic load of <2 x i16> through an addrspace(3)
; pointer; the two lanes are added and the i16 sum is stored to %out.
; Every target expects a single 32-bit DS load. GFX9 expects no instruction
; between the wait and the add; GFX10/GFX11 expect buffer_gl0_inv there and
; GFX12 expects global_inv scope:SCOPE_SE. On GFX11/GFX12 SelectionDAG adds
; v0.l and v0.h directly, while GlobalISel shifts the high half down first.
| define amdgpu_cs void @atomic_load_i16x2_seq_cst_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { |
| ; GFX9-LABEL: atomic_load_i16x2_seq_cst_agent: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: ds_read_b32 v0, v0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX9-NEXT: ds_write_b16 v1, v0 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: atomic_load_i16x2_seq_cst_agent: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: ds_read_b32 v0, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX10-NEXT: v_add_nc_u16 v0, v0, v2 |
| ; GFX10-NEXT: ds_write_b16 v1, v0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-SDAG-LABEL: atomic_load_i16x2_seq_cst_agent: |
| ; GFX11-SDAG: ; %bb.0: |
| ; GFX11-SDAG-NEXT: ds_load_b32 v0, v0 |
| ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-SDAG-NEXT: buffer_gl0_inv |
| ; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h |
| ; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX11-GISEL-LABEL: atomic_load_i16x2_seq_cst_agent: |
| ; GFX11-GISEL: ; %bb.0: |
| ; GFX11-GISEL-NEXT: ds_load_b32 v0, v0 |
| ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-GISEL-NEXT: buffer_gl0_inv |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l |
| ; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-GISEL-NEXT: s_endpgm |
| ; |
| ; GFX12-SDAG-LABEL: atomic_load_i16x2_seq_cst_agent: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: ds_load_b32 v0, v0 |
| ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE |
| ; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h |
| ; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX12-GISEL-LABEL: atomic_load_i16x2_seq_cst_agent: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: ds_load_b32 v0, v0 |
| ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l |
| ; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-GISEL-NEXT: s_endpgm |
| %a = load atomic <2 x i16>, ptr addrspace(3) %p syncscope("agent") seq_cst, align 4 |
| %e0 = extractelement <2 x i16> %a, i32 0 |
| %e1 = extractelement <2 x i16> %a, i32 1 |
| %sum = add i16 %e0, %e1 |
| store i16 %sum, ptr addrspace(3) %out, align 4 |
| ret void |
| } |
| |
; Monotonic, wavefront-scope atomic load of <2 x i16> through an
; addrspace(3) pointer; the two lanes are added and the i16 sum is stored
; to %out. Every target expects a single 32-bit DS load followed by a wait.
; GFX9 expects one v_add_u16_sdwa; GFX10 expects v_lshrrev_b32 followed by
; v_add_nc_u16. On GFX11/GFX12 SelectionDAG adds v0.l and v0.h directly,
; while GlobalISel emits a v_lshrrev_b32 before the add.
| define amdgpu_cs void @atomic_load_i16x2_monotonic_wavefront(ptr addrspace(3) %p, ptr addrspace(3) %out) { |
| ; GFX9-LABEL: atomic_load_i16x2_monotonic_wavefront: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: ds_read_b32 v0, v0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX9-NEXT: ds_write_b16 v1, v0 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: atomic_load_i16x2_monotonic_wavefront: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: ds_read_b32 v0, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX10-NEXT: v_add_nc_u16 v0, v0, v2 |
| ; GFX10-NEXT: ds_write_b16 v1, v0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_wavefront: |
| ; GFX11-SDAG: ; %bb.0: |
| ; GFX11-SDAG-NEXT: ds_load_b32 v0, v0 |
| ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h |
| ; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_wavefront: |
| ; GFX11-GISEL: ; %bb.0: |
| ; GFX11-GISEL-NEXT: ds_load_b32 v0, v0 |
| ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l |
| ; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-GISEL-NEXT: s_endpgm |
| ; |
| ; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_wavefront: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: ds_load_b32 v0, v0 |
| ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h |
| ; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_wavefront: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: ds_load_b32 v0, v0 |
| ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l |
| ; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-GISEL-NEXT: s_endpgm |
| %a = load atomic <2 x i16>, ptr addrspace(3) %p syncscope("wavefront") monotonic, align 4 |
| %e0 = extractelement <2 x i16> %a, i32 0 |
| %e1 = extractelement <2 x i16> %a, i32 1 |
| %sum = add i16 %e0, %e1 |
| store i16 %sum, ptr addrspace(3) %out, align 4 |
| ret void |
| } |
| |
; Monotonic, agent-scope atomic load of <4 x half> through an addrspace(3)
; pointer. The IR computes (e0 + e1) + (e2 * e3) in half precision and stores
; the result to %out. Every target expects a single 64-bit DS load.
; GFX9/GFX10 expect an SDWA add and an SDWA mul followed by v_add_f16_e32.
; On GFX11/GFX12 SelectionDAG reads the .l/.h register halves directly, while
; GlobalISel first emits two v_lshrrev_b32 shifts to extract the high halves.
| define amdgpu_cs void @atomic_load_f16x4_monotonic_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { |
| ; GFX9-LABEL: atomic_load_f16x4_monotonic_agent: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: ds_read_b64 v[2:3], v0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_add_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX9-NEXT: v_mul_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX9-NEXT: v_add_f16_e32 v0, v0, v2 |
| ; GFX9-NEXT: ds_write_b16 v1, v0 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: atomic_load_f16x4_monotonic_agent: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: ds_read_b64 v[2:3], v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_add_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX10-NEXT: v_mul_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX10-NEXT: v_add_f16_e32 v0, v0, v2 |
| ; GFX10-NEXT: ds_write_b16 v1, v0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-SDAG-LABEL: atomic_load_f16x4_monotonic_agent: |
| ; GFX11-SDAG: ; %bb.0: |
| ; GFX11-SDAG-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v2.l, v2.h |
| ; GFX11-SDAG-NEXT: v_mul_f16_e32 v0.h, v3.l, v3.h |
| ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h |
| ; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX11-GISEL-LABEL: atomic_load_f16x4_monotonic_agent: |
| ; GFX11-GISEL: ; %bb.0: |
| ; GFX11-GISEL-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l |
| ; GFX11-GISEL-NEXT: v_mul_f16_e32 v0.h, v3.l, v4.l |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h |
| ; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-GISEL-NEXT: s_endpgm |
| ; |
| ; GFX12-SDAG-LABEL: atomic_load_f16x4_monotonic_agent: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v2.l, v2.h |
| ; GFX12-SDAG-NEXT: v_mul_f16_e32 v0.h, v3.l, v3.h |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h |
| ; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX12-GISEL-LABEL: atomic_load_f16x4_monotonic_agent: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l |
| ; GFX12-GISEL-NEXT: v_mul_f16_e32 v0.h, v3.l, v4.l |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h |
| ; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-GISEL-NEXT: s_endpgm |
| %a0 = load atomic <4 x half>, ptr addrspace(3) %p syncscope("agent") monotonic, align 8 |
| %num1 = extractelement <4 x half> %a0, i32 0 |
| %num2 = extractelement <4 x half> %a0, i32 1 |
| %num3 = extractelement <4 x half> %a0, i32 2 |
| %num4 = extractelement <4 x half> %a0, i32 3 |
| %add = fadd half %num1, %num2 |
| %mul = fmul half %num3, %num4 |
| %res = fadd half %add, %mul |
| store half %res, ptr addrspace(3) %out, align 4 |
| ret void |
| } |
| |
; seq_cst, agent-scope atomic load of <4 x half> through an addrspace(3)
; pointer. The IR computes (e0 + e1) + (e2 * e3) in half precision and stores
; the result to %out. Every target expects a single 64-bit DS load.
; GFX9 expects no instruction between the wait and the arithmetic;
; GFX10/GFX11 expect buffer_gl0_inv there and GFX12 expects
; global_inv scope:SCOPE_SE. On GFX11/GFX12 SelectionDAG reads the .l/.h
; register halves directly, while GlobalISel emits two v_lshrrev_b32 first.
| define amdgpu_cs void @atomic_load_f16x4_seq_cst_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { |
| ; GFX9-LABEL: atomic_load_f16x4_seq_cst_agent: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: ds_read_b64 v[2:3], v0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_add_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX9-NEXT: v_mul_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX9-NEXT: v_add_f16_e32 v0, v0, v2 |
| ; GFX9-NEXT: ds_write_b16 v1, v0 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: atomic_load_f16x4_seq_cst_agent: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: ds_read_b64 v[2:3], v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: v_add_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX10-NEXT: v_mul_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX10-NEXT: v_add_f16_e32 v0, v0, v2 |
| ; GFX10-NEXT: ds_write_b16 v1, v0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-SDAG-LABEL: atomic_load_f16x4_seq_cst_agent: |
| ; GFX11-SDAG: ; %bb.0: |
| ; GFX11-SDAG-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-SDAG-NEXT: buffer_gl0_inv |
| ; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v2.l, v2.h |
| ; GFX11-SDAG-NEXT: v_mul_f16_e32 v0.h, v3.l, v3.h |
| ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h |
| ; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX11-GISEL-LABEL: atomic_load_f16x4_seq_cst_agent: |
| ; GFX11-GISEL: ; %bb.0: |
| ; GFX11-GISEL-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-GISEL-NEXT: buffer_gl0_inv |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l |
| ; GFX11-GISEL-NEXT: v_mul_f16_e32 v0.h, v3.l, v4.l |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h |
| ; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-GISEL-NEXT: s_endpgm |
| ; |
| ; GFX12-SDAG-LABEL: atomic_load_f16x4_seq_cst_agent: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE |
| ; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v2.l, v2.h |
| ; GFX12-SDAG-NEXT: v_mul_f16_e32 v0.h, v3.l, v3.h |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h |
| ; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX12-GISEL-LABEL: atomic_load_f16x4_seq_cst_agent: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l |
| ; GFX12-GISEL-NEXT: v_mul_f16_e32 v0.h, v3.l, v4.l |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h |
| ; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-GISEL-NEXT: s_endpgm |
| %a0 = load atomic <4 x half>, ptr addrspace(3) %p syncscope("agent") seq_cst, align 8 |
| %num1 = extractelement <4 x half> %a0, i32 0 |
| %num2 = extractelement <4 x half> %a0, i32 1 |
| %num3 = extractelement <4 x half> %a0, i32 2 |
| %num4 = extractelement <4 x half> %a0, i32 3 |
| %add = fadd half %num1, %num2 |
| %mul = fmul half %num3, %num4 |
| %res = fadd half %add, %mul |
| store half %res, ptr addrspace(3) %out, align 4 |
| ret void |
| } |
| |
; Monotonic, wavefront-scope atomic load of <4 x half> through an
; addrspace(3) pointer. The IR computes (e0 + e1) + (e2 * e3) in half
; precision and stores the result to %out. Every target expects a single
; 64-bit DS load followed by a wait. GFX9/GFX10 expect an SDWA add and an
; SDWA mul followed by v_add_f16_e32. On GFX11/GFX12 SelectionDAG reads the
; .l/.h register halves directly, while GlobalISel emits two v_lshrrev_b32
; shifts first.
| define amdgpu_cs void @atomic_load_f16x4_monotonic_wavefront(ptr addrspace(3) %p, ptr addrspace(3) %out) { |
| ; GFX9-LABEL: atomic_load_f16x4_monotonic_wavefront: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: ds_read_b64 v[2:3], v0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_add_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX9-NEXT: v_mul_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX9-NEXT: v_add_f16_e32 v0, v0, v2 |
| ; GFX9-NEXT: ds_write_b16 v1, v0 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: atomic_load_f16x4_monotonic_wavefront: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: ds_read_b64 v[2:3], v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_add_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX10-NEXT: v_mul_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX10-NEXT: v_add_f16_e32 v0, v0, v2 |
| ; GFX10-NEXT: ds_write_b16 v1, v0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-SDAG-LABEL: atomic_load_f16x4_monotonic_wavefront: |
| ; GFX11-SDAG: ; %bb.0: |
| ; GFX11-SDAG-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v2.l, v2.h |
| ; GFX11-SDAG-NEXT: v_mul_f16_e32 v0.h, v3.l, v3.h |
| ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h |
| ; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX11-GISEL-LABEL: atomic_load_f16x4_monotonic_wavefront: |
| ; GFX11-GISEL: ; %bb.0: |
| ; GFX11-GISEL-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l |
| ; GFX11-GISEL-NEXT: v_mul_f16_e32 v0.h, v3.l, v4.l |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h |
| ; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-GISEL-NEXT: s_endpgm |
| ; |
| ; GFX12-SDAG-LABEL: atomic_load_f16x4_monotonic_wavefront: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v2.l, v2.h |
| ; GFX12-SDAG-NEXT: v_mul_f16_e32 v0.h, v3.l, v3.h |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h |
| ; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX12-GISEL-LABEL: atomic_load_f16x4_monotonic_wavefront: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l |
| ; GFX12-GISEL-NEXT: v_mul_f16_e32 v0.h, v3.l, v4.l |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h |
| ; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-GISEL-NEXT: s_endpgm |
| %a0 = load atomic <4 x half>, ptr addrspace(3) %p syncscope("wavefront") monotonic, align 8 |
| %num1 = extractelement <4 x half> %a0, i32 0 |
| %num2 = extractelement <4 x half> %a0, i32 1 |
| %num3 = extractelement <4 x half> %a0, i32 2 |
| %num4 = extractelement <4 x half> %a0, i32 3 |
| %add = fadd half %num1, %num2 |
| %mul = fmul half %num3, %num4 |
| %res = fadd half %add, %mul |
| store half %res, ptr addrspace(3) %out, align 4 |
| ret void |
| } |
| |
| ; Atomic <4 x i16> load from addrspace(3), monotonic ordering, syncscope "agent". |
| ; The four lanes are combined as (e0 + e1) + (e2 * e3) and the i16 result is |
| ; stored to %out. The expected output for every target is one 64-bit DS load |
| ; followed by a wait, with no cache-invalidate instruction. |
| define amdgpu_cs void @atomic_load_i16x4_monotonic_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { |
| ; GFX9-LABEL: atomic_load_i16x4_monotonic_agent: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: ds_read_b64 v[2:3], v0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3 |
| ; GFX9-NEXT: v_add_u16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX9-NEXT: v_mad_legacy_u16 v0, v3, v0, v2 |
| ; GFX9-NEXT: ds_write_b16 v1, v0 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: atomic_load_i16x4_monotonic_agent: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: ds_read_b64 v[2:3], v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2 |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v3 |
| ; GFX10-NEXT: v_add_nc_u16 v0, v2, v0 |
| ; GFX10-NEXT: v_mad_u16 v0, v3, v4, v0 |
| ; GFX10-NEXT: ds_write_b16 v1, v0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-SDAG-LABEL: atomic_load_i16x4_monotonic_agent: |
| ; GFX11-SDAG: ; %bb.0: |
| ; GFX11-SDAG-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v2.l, v2.h |
| ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-SDAG-NEXT: v_mad_u16 v0.l, v3.l, v3.h, v0.l |
| ; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX11-GISEL-LABEL: atomic_load_i16x4_monotonic_agent: |
| ; GFX11-GISEL: ; %bb.0: |
| ; GFX11-GISEL-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v2.l, v0.l |
| ; GFX11-GISEL-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l |
| ; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-GISEL-NEXT: s_endpgm |
| ; |
| ; GFX12-SDAG-LABEL: atomic_load_i16x4_monotonic_agent: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v2.l, v2.h |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-SDAG-NEXT: v_mad_u16 v0.l, v3.l, v3.h, v0.l |
| ; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX12-GISEL-LABEL: atomic_load_i16x4_monotonic_agent: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v2.l, v0.l |
| ; GFX12-GISEL-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l |
| ; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-GISEL-NEXT: s_endpgm |
| %a0 = load atomic <4 x i16>, ptr addrspace(3) %p syncscope("agent") monotonic, align 8 |
| %num1 = extractelement <4 x i16> %a0, i32 0 |
| %num2 = extractelement <4 x i16> %a0, i32 1 |
| %num3 = extractelement <4 x i16> %a0, i32 2 |
| %num4 = extractelement <4 x i16> %a0, i32 3 |
| %add = add i16 %num1, %num2 |
| %mul = mul i16 %num3, %num4 |
| %res = add i16 %add, %mul |
| store i16 %res, ptr addrspace(3) %out, align 4 |
| ret void |
| } |
| |
| ; Atomic <4 x i16> load from addrspace(3), seq_cst ordering, syncscope "agent". |
| ; The four lanes are combined as (e0 + e1) + (e2 * e3) and the i16 result is |
| ; stored to %out. Unlike the monotonic variant of this test, the expected |
| ; output places a cache invalidate after the wait on the gfx10 and gfx11 runs |
| ; (buffer_gl0_inv) and on the gfx12 runs (global_inv scope:SCOPE_SE); the gfx9 |
| ; run expects no extra instruction. |
| define amdgpu_cs void @atomic_load_i16x4_seq_cst_agent(ptr addrspace(3) %p, ptr addrspace(3) %out) { |
| ; GFX9-LABEL: atomic_load_i16x4_seq_cst_agent: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: ds_read_b64 v[2:3], v0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3 |
| ; GFX9-NEXT: v_add_u16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX9-NEXT: v_mad_legacy_u16 v0, v3, v0, v2 |
| ; GFX9-NEXT: ds_write_b16 v1, v0 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: atomic_load_i16x4_seq_cst_agent: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: ds_read_b64 v[2:3], v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2 |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v3 |
| ; GFX10-NEXT: v_add_nc_u16 v0, v2, v0 |
| ; GFX10-NEXT: v_mad_u16 v0, v3, v4, v0 |
| ; GFX10-NEXT: ds_write_b16 v1, v0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-SDAG-LABEL: atomic_load_i16x4_seq_cst_agent: |
| ; GFX11-SDAG: ; %bb.0: |
| ; GFX11-SDAG-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-SDAG-NEXT: buffer_gl0_inv |
| ; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v2.l, v2.h |
| ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-SDAG-NEXT: v_mad_u16 v0.l, v3.l, v3.h, v0.l |
| ; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX11-GISEL-LABEL: atomic_load_i16x4_seq_cst_agent: |
| ; GFX11-GISEL: ; %bb.0: |
| ; GFX11-GISEL-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-GISEL-NEXT: buffer_gl0_inv |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v2.l, v0.l |
| ; GFX11-GISEL-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l |
| ; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-GISEL-NEXT: s_endpgm |
| ; |
| ; GFX12-SDAG-LABEL: atomic_load_i16x4_seq_cst_agent: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE |
| ; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v2.l, v2.h |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-SDAG-NEXT: v_mad_u16 v0.l, v3.l, v3.h, v0.l |
| ; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX12-GISEL-LABEL: atomic_load_i16x4_seq_cst_agent: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v2.l, v0.l |
| ; GFX12-GISEL-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l |
| ; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-GISEL-NEXT: s_endpgm |
| %a0 = load atomic <4 x i16>, ptr addrspace(3) %p syncscope("agent") seq_cst, align 8 |
| %num1 = extractelement <4 x i16> %a0, i32 0 |
| %num2 = extractelement <4 x i16> %a0, i32 1 |
| %num3 = extractelement <4 x i16> %a0, i32 2 |
| %num4 = extractelement <4 x i16> %a0, i32 3 |
| %add = add i16 %num1, %num2 |
| %mul = mul i16 %num3, %num4 |
| %res = add i16 %add, %mul |
| store i16 %res, ptr addrspace(3) %out, align 4 |
| ret void |
| } |
| |
| ; Atomic <4 x i16> load from addrspace(3), monotonic ordering, syncscope |
| ; "wavefront". The four lanes are combined as (e0 + e1) + (e2 * e3) and the i16 |
| ; result is stored to %out. The expected output for every target is one 64-bit |
| ; DS load followed by a wait, with no cache-invalidate instruction. |
| define amdgpu_cs void @atomic_load_i16x4_monotonic_wavefront(ptr addrspace(3) %p, ptr addrspace(3) %out) { |
| ; GFX9-LABEL: atomic_load_i16x4_monotonic_wavefront: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: ds_read_b64 v[2:3], v0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3 |
| ; GFX9-NEXT: v_add_u16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX9-NEXT: v_mad_legacy_u16 v0, v3, v0, v2 |
| ; GFX9-NEXT: ds_write_b16 v1, v0 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: atomic_load_i16x4_monotonic_wavefront: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: ds_read_b64 v[2:3], v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2 |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v3 |
| ; GFX10-NEXT: v_add_nc_u16 v0, v2, v0 |
| ; GFX10-NEXT: v_mad_u16 v0, v3, v4, v0 |
| ; GFX10-NEXT: ds_write_b16 v1, v0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-SDAG-LABEL: atomic_load_i16x4_monotonic_wavefront: |
| ; GFX11-SDAG: ; %bb.0: |
| ; GFX11-SDAG-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v2.l, v2.h |
| ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-SDAG-NEXT: v_mad_u16 v0.l, v3.l, v3.h, v0.l |
| ; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX11-GISEL-LABEL: atomic_load_i16x4_monotonic_wavefront: |
| ; GFX11-GISEL: ; %bb.0: |
| ; GFX11-GISEL-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v2.l, v0.l |
| ; GFX11-GISEL-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l |
| ; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-GISEL-NEXT: s_endpgm |
| ; |
| ; GFX12-SDAG-LABEL: atomic_load_i16x4_monotonic_wavefront: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v2.l, v2.h |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-SDAG-NEXT: v_mad_u16 v0.l, v3.l, v3.h, v0.l |
| ; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX12-GISEL-LABEL: atomic_load_i16x4_monotonic_wavefront: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: ds_load_b64 v[2:3], v0 |
| ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v2.l, v0.l |
| ; GFX12-GISEL-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l |
| ; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-GISEL-NEXT: s_endpgm |
| %a0 = load atomic <4 x i16>, ptr addrspace(3) %p syncscope("wavefront") monotonic, align 8 |
| %num1 = extractelement <4 x i16> %a0, i32 0 |
| %num2 = extractelement <4 x i16> %a0, i32 1 |
| %num3 = extractelement <4 x i16> %a0, i32 2 |
| %num4 = extractelement <4 x i16> %a0, i32 3 |
| %add = add i16 %num1, %num2 |
| %mul = mul i16 %num3, %num4 |
| %res = add i16 %add, %mul |
| store i16 %res, ptr addrspace(3) %out, align 4 |
| ret void |
| } |
| |
| ; Atomic <2 x float> load (monotonic, syncscope "agent") from addrspace(3) at |
| ; %p plus a constant 1-byte GEP; the two lanes are added and the float sum is |
| ; stored to %out. The expected output folds the GEP into the load's immediate |
| ; field (offset:1) instead of emitting a separate address add. |
| ; NOTE(review): the load is marked align 8 although its address is %p + 1; |
| ; this looks deliberate for exercising offset folding -- confirm. |
| define amdgpu_cs void @atomic_load_f32x2_monotonic_agent_offset_1(ptr addrspace(3) %p, ptr addrspace(3) %out) { |
| ; GFX9-LABEL: atomic_load_f32x2_monotonic_agent_offset_1: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: ds_read_b64 v[2:3], v0 offset:1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX9-NEXT: ds_write_b32 v1, v0 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: atomic_load_f32x2_monotonic_agent_offset_1: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: ds_read_b64 v[2:3], v0 offset:1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX10-NEXT: ds_write_b32 v1, v0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: atomic_load_f32x2_monotonic_agent_offset_1: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: ds_load_b64 v[2:3], v0 offset:1 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX11-NEXT: ds_store_b32 v1, v0 |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: atomic_load_f32x2_monotonic_agent_offset_1: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: ds_load_b64 v[2:3], v0 offset:1 |
| ; GFX12-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX12-NEXT: ds_store_b32 v1, v0 |
| ; GFX12-NEXT: s_endpgm |
| %gep = getelementptr inbounds i8, ptr addrspace(3) %p, i64 1 |
| %a0 = load atomic <2 x float>, ptr addrspace(3) %gep syncscope("agent") monotonic, align 8 |
| %num1 = extractelement <2 x float> %a0, i32 0 |
| %num2 = extractelement <2 x float> %a0, i32 1 |
| %res = fadd float %num1, %num2 |
| store float %res, ptr addrspace(3) %out, align 4 |
| ret void |
| } |
| |
| ; Atomic <2 x float> load (monotonic, syncscope "agent") from addrspace(3) at |
| ; %p plus a constant 4095-byte GEP; the two lanes are added and the float sum |
| ; is stored to %out. The expected output folds the GEP into the load's |
| ; immediate field (offset:4095). |
| ; NOTE(review): the name says "max", but nothing in this test establishes that |
| ; 4095 is the largest offset a DS load can encode -- confirm against the ISA, |
| ; and consider adding a case one past the real limit. |
| define amdgpu_cs void @atomic_load_f32x2_monotonic_agent_offset_max(ptr addrspace(3) %p, ptr addrspace(3) %out) { |
| ; GFX9-LABEL: atomic_load_f32x2_monotonic_agent_offset_max: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: ds_read_b64 v[2:3], v0 offset:4095 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX9-NEXT: ds_write_b32 v1, v0 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: atomic_load_f32x2_monotonic_agent_offset_max: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: ds_read_b64 v[2:3], v0 offset:4095 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX10-NEXT: ds_write_b32 v1, v0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: atomic_load_f32x2_monotonic_agent_offset_max: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: ds_load_b64 v[2:3], v0 offset:4095 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX11-NEXT: ds_store_b32 v1, v0 |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: atomic_load_f32x2_monotonic_agent_offset_max: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: ds_load_b64 v[2:3], v0 offset:4095 |
| ; GFX12-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GFX12-NEXT: ds_store_b32 v1, v0 |
| ; GFX12-NEXT: s_endpgm |
| %gep = getelementptr inbounds i8, ptr addrspace(3) %p, i64 4095 |
| %a0 = load atomic <2 x float>, ptr addrspace(3) %gep syncscope("agent") monotonic, align 8 |
| %num1 = extractelement <2 x float> %a0, i32 0 |
| %num2 = extractelement <2 x float> %a0, i32 1 |
| %res = fadd float %num1, %num2 |
| store float %res, ptr addrspace(3) %out, align 4 |
| ret void |
| } |
| |
| ; Atomic <2 x i16> load (monotonic, syncscope "agent") from addrspace(3) at |
| ; %p plus a constant 1-byte GEP; the two lanes are added and the i16 sum is |
| ; stored to %out. The expected output is a single 32-bit DS load with the GEP |
| ; folded into its immediate field (offset:1). |
| ; NOTE(review): the load is marked align 8 although its address is %p + 1 and |
| ; the vector is only 4 bytes wide; presumably deliberate -- confirm. |
| define amdgpu_cs void @atomic_load_i16x2_monotonic_agent_offset_1(ptr addrspace(3) %p, ptr addrspace(3) %out) { |
| ; GFX9-LABEL: atomic_load_i16x2_monotonic_agent_offset_1: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: ds_read_b32 v0, v0 offset:1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX9-NEXT: ds_write_b16 v1, v0 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: atomic_load_i16x2_monotonic_agent_offset_1: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: ds_read_b32 v0, v0 offset:1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX10-NEXT: v_add_nc_u16 v0, v0, v2 |
| ; GFX10-NEXT: ds_write_b16 v1, v0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_1: |
| ; GFX11-SDAG: ; %bb.0: |
| ; GFX11-SDAG-NEXT: ds_load_b32 v0, v0 offset:1 |
| ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h |
| ; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_1: |
| ; GFX11-GISEL: ; %bb.0: |
| ; GFX11-GISEL-NEXT: ds_load_b32 v0, v0 offset:1 |
| ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l |
| ; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-GISEL-NEXT: s_endpgm |
| ; |
| ; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_1: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: ds_load_b32 v0, v0 offset:1 |
| ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h |
| ; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_1: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: ds_load_b32 v0, v0 offset:1 |
| ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l |
| ; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-GISEL-NEXT: s_endpgm |
| %gep = getelementptr inbounds i8, ptr addrspace(3) %p, i64 1 |
| %a = load atomic <2 x i16>, ptr addrspace(3) %gep syncscope("agent") monotonic, align 8 |
| %e0 = extractelement <2 x i16> %a, i32 0 |
| %e1 = extractelement <2 x i16> %a, i32 1 |
| %sum = add i16 %e0, %e1 |
| store i16 %sum, ptr addrspace(3) %out, align 4 |
| ret void |
| } |
| |
| ; Atomic <2 x i16> load (monotonic, syncscope "agent") from addrspace(3) at |
| ; %p plus a constant 4095-byte GEP; the two lanes are added and the i16 sum is |
| ; stored to %out. The expected output is a single 32-bit DS load with the GEP |
| ; folded into its immediate field (offset:4095). |
| ; NOTE(review): the name says "max", but nothing in this test establishes that |
| ; 4095 is the largest offset a DS load can encode -- confirm against the ISA. |
| ; The load is also marked align 8 at an odd byte offset; presumably deliberate. |
| define amdgpu_cs void @atomic_load_i16x2_monotonic_agent_offset_max(ptr addrspace(3) %p, ptr addrspace(3) %out) { |
| ; GFX9-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: ds_read_b32 v0, v0 offset:4095 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| ; GFX9-NEXT: ds_write_b16 v1, v0 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: ds_read_b32 v0, v0 offset:4095 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX10-NEXT: v_add_nc_u16 v0, v0, v2 |
| ; GFX10-NEXT: ds_write_b16 v1, v0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: |
| ; GFX11-SDAG: ; %bb.0: |
| ; GFX11-SDAG-NEXT: ds_load_b32 v0, v0 offset:4095 |
| ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h |
| ; GFX11-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX11-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: |
| ; GFX11-GISEL: ; %bb.0: |
| ; GFX11-GISEL-NEXT: ds_load_b32 v0, v0 offset:4095 |
| ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l |
| ; GFX11-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX11-GISEL-NEXT: s_endpgm |
| ; |
| ; GFX12-SDAG-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: ds_load_b32 v0, v0 offset:4095 |
| ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h |
| ; GFX12-SDAG-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX12-GISEL-LABEL: atomic_load_i16x2_monotonic_agent_offset_max: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: ds_load_b32 v0, v0 offset:4095 |
| ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l |
| ; GFX12-GISEL-NEXT: ds_store_b16 v1, v0 |
| ; GFX12-GISEL-NEXT: s_endpgm |
| %gep = getelementptr inbounds i8, ptr addrspace(3) %p, i64 4095 |
| %a = load atomic <2 x i16>, ptr addrspace(3) %gep syncscope("agent") monotonic, align 8 |
| %e0 = extractelement <2 x i16> %a, i32 0 |
| %e1 = extractelement <2 x i16> %a, i32 1 |
| %sum = add i16 %e0, %e1 |
| store i16 %sum, ptr addrspace(3) %out, align 4 |
| ret void |
| } |
| ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: |
| ; GCN: {{.*}} |
| ; GFX10-GISEL: {{.*}} |
| ; GFX10-SDAG: {{.*}} |
| ; GFX9-GISEL: {{.*}} |
| ; GFX9-SDAG: {{.*}} |