| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 |
| ; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefix=GFX9-EXPAND %s |
| ; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefix=GFX9-NOEXPAND %s |
| ; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1010 | FileCheck --check-prefix=GFX10-EXPAND %s |
| ; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1010 | FileCheck --check-prefix=GFX10-NOEXPAND %s |
| ; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1100 | FileCheck --check-prefix=GFX11-EXPAND %s |
| ; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1100 | FileCheck --check-prefix=GFX11-NOEXPAND %s |
| ; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1200 | FileCheck --check-prefix=GFX12-EXPAND %s |
| ; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1200 | FileCheck --check-prefix=GFX12-NOEXPAND %s |
| |
| ; When the "amdgpu-expand-waitcnt-profiling" function attribute is set (the |
| ; EXPAND run lines inject it via sed) and there are N outstanding |
| ; operations, instead of emitting a single waitcnt(target), we emit: |
| ; waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target) |
| ; |
| ; This allows PC-sampling profilers to identify which specific operation |
| ; is causing a stall by observing where the program counter is stuck. |
| |
| ; Three scalar loads feed one add chain whose result is stored to %out. |
| ; In the assertions below the expanded and non-expanded outputs are identical |
| ; on every target: a single s_waitcnt lgkmcnt(0) (s_wait_kmcnt 0x0 on GFX12) |
| ; follows the three s_load instructions. |
| ; NOTE(review): presumably nothing is expanded because scalar loads may return |
| ; out of order -- confirm against the pass. |
| define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, ptr addrspace(4) %ptr_b, ptr addrspace(4) %ptr_c, ptr addrspace(1) %out) #0 { |
| ; GFX9-EXPAND-LABEL: test_lgkmcnt_scalar_loads: |
| ; GFX9-EXPAND: ; %bb.0: |
| ; GFX9-EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 |
| ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 |
| ; GFX9-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 |
| ; GFX9-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 |
| ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s2 |
| ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[14:15] |
| ; GFX9-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX9-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: |
| ; GFX9-NOEXPAND: ; %bb.0: |
| ; GFX9-NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 |
| ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 |
| ; GFX9-NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 |
| ; GFX9-NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 |
| ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s0, s2 |
| ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15] |
| ; GFX9-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX10-EXPAND-LABEL: test_lgkmcnt_scalar_loads: |
| ; GFX10-EXPAND: ; %bb.0: |
| ; GFX10-EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 |
| ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 |
| ; GFX10-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 |
| ; GFX10-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 |
| ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s2 |
| ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[14:15] |
| ; GFX10-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX10-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: |
| ; GFX10-NOEXPAND: ; %bb.0: |
| ; GFX10-NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 |
| ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 |
| ; GFX10-NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 |
| ; GFX10-NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 |
| ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s0, s2 |
| ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15] |
| ; GFX10-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX11-EXPAND-LABEL: test_lgkmcnt_scalar_loads: |
| ; GFX11-EXPAND: ; %bb.0: |
| ; GFX11-EXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 |
| ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 |
| ; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 |
| ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s2 |
| ; GFX11-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 |
| ; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[6:7] |
| ; GFX11-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX11-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: |
| ; GFX11-NOEXPAND: ; %bb.0: |
| ; GFX11-NOEXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 |
| ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX11-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 |
| ; GFX11-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 |
| ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s2 |
| ; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 |
| ; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[6:7] |
| ; GFX11-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX12-EXPAND-LABEL: test_lgkmcnt_scalar_loads: |
| ; GFX12-EXPAND: ; %bb.0: |
| ; GFX12-EXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 |
| ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 |
| ; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 |
| ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1 |
| ; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s2 |
| ; GFX12-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 |
| ; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[6:7] |
| ; GFX12-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX12-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: |
| ; GFX12-NOEXPAND: ; %bb.0: |
| ; GFX12-NOEXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 |
| ; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX12-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 |
| ; GFX12-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 |
| ; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s1 |
| ; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s2 |
| ; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 |
| ; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[6:7] |
| ; GFX12-NOEXPAND-NEXT: s_endpgm |
| |
| ; One i32 load per pointer argument; the assertions above show each one |
| ; selected as an s_load instruction. |
| %val_a = load i32, ptr addrspace(4) %ptr_a, align 4 |
| %val_b = load i32, ptr addrspace(4) %ptr_b, align 4 |
| %val_c = load i32, ptr addrspace(4) %ptr_c, align 4 |
| ; Sum all three loaded values so every load result is live at the store. |
| %sum1 = add i32 %val_a, %val_b |
| %sum2 = add i32 %sum1, %val_c |
| store i32 %sum2, ptr addrspace(1) %out, align 4 |
| ret void |
| } |
| |
| ; Three global loads at thread-varying addresses are summed and stored. |
| ; With the attribute set, the assertions expect a descending wait sequence |
| ; after the loads: s_waitcnt vmcnt(2), vmcnt(1), vmcnt(0) on GFX9/10/11 and |
| ; s_wait_loadcnt 0x2, 0x1, 0x0 on GFX12. Without it, only the final wait for |
| ; zero is expected. |
| define amdgpu_kernel void @test_vmcnt_global_loads(ptr addrspace(1) %buf, ptr addrspace(1) %out) #0 { |
| ; GFX9-EXPAND-LABEL: test_vmcnt_global_loads: |
| ; GFX9-EXPAND: ; %bb.0: |
| ; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX9-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-EXPAND-NEXT: global_load_dword v1, v0, s[0:1] |
| ; GFX9-EXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 |
| ; GFX9-EXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 |
| ; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(2) |
| ; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(1) |
| ; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 |
| ; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[2:3] |
| ; GFX9-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX9-NOEXPAND-LABEL: test_vmcnt_global_loads: |
| ; GFX9-NOEXPAND: ; %bb.0: |
| ; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX9-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NOEXPAND-NEXT: global_load_dword v1, v0, s[0:1] |
| ; GFX9-NOEXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 |
| ; GFX9-NOEXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 |
| ; GFX9-NOEXPAND-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 |
| ; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[2:3] |
| ; GFX9-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX10-EXPAND-LABEL: test_vmcnt_global_loads: |
| ; GFX10-EXPAND: ; %bb.0: |
| ; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-EXPAND-NEXT: s_clause 0x2 |
| ; GFX10-EXPAND-NEXT: global_load_dword v1, v0, s[0:1] |
| ; GFX10-EXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 |
| ; GFX10-EXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 |
| ; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(2) |
| ; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(1) |
| ; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 |
| ; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[2:3] |
| ; GFX10-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX10-NOEXPAND-LABEL: test_vmcnt_global_loads: |
| ; GFX10-NOEXPAND: ; %bb.0: |
| ; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NOEXPAND-NEXT: s_clause 0x2 |
| ; GFX10-NOEXPAND-NEXT: global_load_dword v1, v0, s[0:1] |
| ; GFX10-NOEXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 |
| ; GFX10-NOEXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 |
| ; GFX10-NOEXPAND-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 |
| ; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[2:3] |
| ; GFX10-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX11-EXPAND-LABEL: test_vmcnt_global_loads: |
| ; GFX11-EXPAND: ; %bb.0: |
| ; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-EXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-EXPAND-NEXT: s_clause 0x2 |
| ; GFX11-EXPAND-NEXT: global_load_b32 v1, v0, s[0:1] |
| ; GFX11-EXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 |
| ; GFX11-EXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512 |
| ; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(2) |
| ; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(1) |
| ; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 |
| ; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[2:3] |
| ; GFX11-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX11-NOEXPAND-LABEL: test_vmcnt_global_loads: |
| ; GFX11-NOEXPAND: ; %bb.0: |
| ; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NOEXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NOEXPAND-NEXT: s_clause 0x2 |
| ; GFX11-NOEXPAND-NEXT: global_load_b32 v1, v0, s[0:1] |
| ; GFX11-NOEXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 |
| ; GFX11-NOEXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512 |
| ; GFX11-NOEXPAND-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 |
| ; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[2:3] |
| ; GFX11-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX12-EXPAND-LABEL: test_vmcnt_global_loads: |
| ; GFX12-EXPAND: ; %bb.0: |
| ; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-EXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-EXPAND-NEXT: s_clause 0x2 |
| ; GFX12-EXPAND-NEXT: global_load_b32 v1, v0, s[0:1] |
| ; GFX12-EXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 |
| ; GFX12-EXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512 |
| ; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x2 |
| ; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x1 |
| ; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 |
| ; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[2:3] |
| ; GFX12-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX12-NOEXPAND-LABEL: test_vmcnt_global_loads: |
| ; GFX12-NOEXPAND: ; %bb.0: |
| ; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-NOEXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NOEXPAND-NEXT: s_clause 0x2 |
| ; GFX12-NOEXPAND-NEXT: global_load_b32 v1, v0, s[0:1] |
| ; GFX12-NOEXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 |
| ; GFX12-NOEXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512 |
| ; GFX12-NOEXPAND-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 |
| ; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[2:3] |
| ; GFX12-NOEXPAND-NEXT: s_endpgm |
| |
| ; Use thread ID to create thread-varying addresses -> forces vector loads |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() |
| %tid64 = zext i32 %tid to i64 |
| |
| ; Three separate global loads with thread-varying addresses |
| ; Non-volatile loads allow multiple operations to be in-flight |
| %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 %tid64 |
| %val0 = load i32, ptr addrspace(1) %ptr0, align 4 |
| |
| ; 64 i32 elements past %ptr0; appears as offset:256 (bytes) in the assertions. |
| %offset1 = add i64 %tid64, 64 |
| %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset1 |
| %val1 = load i32, ptr addrspace(1) %ptr1, align 4 |
| |
| ; 128 i32 elements past %ptr0; appears as offset:512 (bytes) in the assertions. |
| %offset2 = add i64 %tid64, 128 |
| %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset2 |
| %val2 = load i32, ptr addrspace(1) %ptr2, align 4 |
| |
| ; Consume all three results together (a single v_add3_u32 in the assertions), |
| ; so the wait before it has to cover every outstanding load. |
| %sum1 = add i32 %val0, %val1 |
| %sum2 = add i32 %sum1, %val2 |
| |
| ; Each thread writes its sum to its own slot of %out. |
| %out_ptr = getelementptr i32, ptr addrspace(1) %out, i64 %tid64 |
| store i32 %sum2, ptr addrspace(1) %out_ptr, align 4 |
| ret void |
| } |
| |
| ; Intrinsic called by @test_vmcnt_global_loads to obtain a per-thread index. |
| declare i32 @llvm.amdgcn.workitem.id.x() |
| |
| ; Three i32 loads from consecutive LDS (addrspace(3)) elements are summed and |
| ; stored to %out. The assertions show the first two loads merged into one |
| ; two-address read (ds_read2_b32 / ds_load_2addr_b32) and the third issued as a |
| ; separate read at offset:8, giving two outstanding LDS operations. |
| ; The expanded and non-expanded expectations are identical on every target: |
| ; one wait for count 1 ahead of the first add and one wait for count 0 ahead |
| ; of the second (s_waitcnt lgkmcnt(...) or, on GFX12, s_wait_dscnt). |
| ; NOTE(review): presumably nothing is added here because the existing waits |
| ; already step down one operation at a time -- confirm against the pass. |
| define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr, ptr addrspace(1) %out) #0 { |
| ; GFX9-EXPAND-LABEL: test_lgkmcnt_lds_operations: |
| ; GFX9-EXPAND: ; %bb.0: |
| ; GFX9-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 |
| ; GFX9-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c |
| ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v3, 0 |
| ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX9-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 |
| ; GFX9-EXPAND-NEXT: ds_read_b32 v2, v2 offset:8 |
| ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1) |
| ; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v1 |
| ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v2 |
| ; GFX9-EXPAND-NEXT: global_store_dword v3, v0, s[0:1] |
| ; GFX9-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX9-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: |
| ; GFX9-NOEXPAND: ; %bb.0: |
| ; GFX9-NOEXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 |
| ; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c |
| ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v3, 0 |
| ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX9-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 |
| ; GFX9-NOEXPAND-NEXT: ds_read_b32 v2, v2 offset:8 |
| ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1) |
| ; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v1 |
| ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v2 |
| ; GFX9-NOEXPAND-NEXT: global_store_dword v3, v0, s[0:1] |
| ; GFX9-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX10-EXPAND-LABEL: test_lgkmcnt_lds_operations: |
| ; GFX10-EXPAND: ; %bb.0: |
| ; GFX10-EXPAND-NEXT: s_clause 0x1 |
| ; GFX10-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 |
| ; GFX10-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c |
| ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX10-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 |
| ; GFX10-EXPAND-NEXT: ds_read_b32 v2, v2 offset:8 |
| ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1) |
| ; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 |
| ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 |
| ; GFX10-EXPAND-NEXT: global_store_dword v1, v0, s[0:1] |
| ; GFX10-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX10-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: |
| ; GFX10-NOEXPAND: ; %bb.0: |
| ; GFX10-NOEXPAND-NEXT: s_clause 0x1 |
| ; GFX10-NOEXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 |
| ; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c |
| ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX10-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 |
| ; GFX10-NOEXPAND-NEXT: ds_read_b32 v2, v2 offset:8 |
| ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1) |
| ; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 |
| ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 |
| ; GFX10-NOEXPAND-NEXT: global_store_dword v1, v0, s[0:1] |
| ; GFX10-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX11-EXPAND-LABEL: test_lgkmcnt_lds_operations: |
| ; GFX11-EXPAND: ; %bb.0: |
| ; GFX11-EXPAND-NEXT: s_clause 0x1 |
| ; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 |
| ; GFX11-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c |
| ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-EXPAND-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 |
| ; GFX11-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8 |
| ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1) |
| ; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 |
| ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 |
| ; GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[0:1] |
| ; GFX11-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX11-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: |
| ; GFX11-NOEXPAND: ; %bb.0: |
| ; GFX11-NOEXPAND-NEXT: s_clause 0x1 |
| ; GFX11-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 |
| ; GFX11-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c |
| ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 |
| ; GFX11-NOEXPAND-NEXT: ds_load_b32 v2, v2 offset:8 |
| ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1) |
| ; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 |
| ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 |
| ; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1] |
| ; GFX11-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX12-EXPAND-LABEL: test_lgkmcnt_lds_operations: |
| ; GFX12-EXPAND: ; %bb.0: |
| ; GFX12-EXPAND-NEXT: s_clause 0x1 |
| ; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 |
| ; GFX12-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c |
| ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-EXPAND-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 |
| ; GFX12-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8 |
| ; GFX12-EXPAND-NEXT: s_wait_dscnt 0x1 |
| ; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 |
| ; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 |
| ; GFX12-EXPAND-NEXT: global_store_b32 v1, v0, s[0:1] |
| ; GFX12-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX12-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: |
| ; GFX12-NOEXPAND: ; %bb.0: |
| ; GFX12-NOEXPAND-NEXT: s_clause 0x1 |
| ; GFX12-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 |
| ; GFX12-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c |
| ; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 |
| ; GFX12-NOEXPAND-NEXT: ds_load_b32 v2, v2 offset:8 |
| ; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x1 |
| ; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 |
| ; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 |
| ; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1] |
| ; GFX12-NOEXPAND-NEXT: s_endpgm |
| |
| ; Addresses of elements 0, 1 and 2 relative to %lds_ptr. |
| %ptr0 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 0 |
| %ptr1 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 1 |
| %ptr2 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 2 |
| %val0 = load i32, ptr addrspace(3) %ptr0, align 4 |
| %val1 = load i32, ptr addrspace(3) %ptr1, align 4 |
| %val2 = load i32, ptr addrspace(3) %ptr2, align 4 |
| ; %sum1 needs only the first two values; %sum2 additionally needs %val2. |
| %sum1 = add i32 %val0, %val1 |
| %sum2 = add i32 %sum1, %val2 |
| store i32 %sum2, ptr addrspace(1) %out, align 4 |
| ret void |
| } |
| |
| ; Two scalar loads are added and the sum is stored to %out. |
| ; The expanded and non-expanded expectations are identical on every target; |
| ; the only waits expected are s_waitcnt lgkmcnt(0) (s_wait_kmcnt 0x0 on GFX12). |
| ; NOTE(review): despite the name, the IR below contains no vector-memory load |
| ; and no vmcnt/loadcnt wait appears in the assertions, so the "vmcnt" half of |
| ; the combination is not exercised -- confirm whether a global load was meant |
| ; to be part of this test. |
| define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_ptr_a, ptr addrspace(4) %scalar_ptr_b, ptr addrspace(1) %out) #0 { |
| ; GFX9-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: |
| ; GFX9-EXPAND: ; %bb.0: |
| ; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX9-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 |
| ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 |
| ; GFX9-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 |
| ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-EXPAND-NEXT: s_add_i32 s0, s4, s5 |
| ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[6:7] |
| ; GFX9-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX9-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: |
| ; GFX9-NOEXPAND: ; %bb.0: |
| ; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 |
| ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 |
| ; GFX9-NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 |
| ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s4, s5 |
| ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7] |
| ; GFX9-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX10-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: |
| ; GFX10-EXPAND: ; %bb.0: |
| ; GFX10-EXPAND-NEXT: s_clause 0x1 |
| ; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 |
| ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 |
| ; GFX10-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 |
| ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-EXPAND-NEXT: s_add_i32 s0, s4, s5 |
| ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[6:7] |
| ; GFX10-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX10-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: |
| ; GFX10-NOEXPAND: ; %bb.0: |
| ; GFX10-NOEXPAND-NEXT: s_clause 0x1 |
| ; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 |
| ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 |
| ; GFX10-NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 |
| ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s4, s5 |
| ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7] |
| ; GFX10-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX11-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: |
| ; GFX11-EXPAND: ; %bb.0: |
| ; GFX11-EXPAND-NEXT: s_clause 0x1 |
| ; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 |
| ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 |
| ; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[4:5] |
| ; GFX11-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX11-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: |
| ; GFX11-NOEXPAND: ; %bb.0: |
| ; GFX11-NOEXPAND-NEXT: s_clause 0x1 |
| ; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NOEXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX11-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 |
| ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 |
| ; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[4:5] |
| ; GFX11-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX12-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: |
| ; GFX12-EXPAND: ; %bb.0: |
| ; GFX12-EXPAND-NEXT: s_clause 0x1 |
| ; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 |
| ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1 |
| ; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 |
| ; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[4:5] |
| ; GFX12-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX12-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: |
| ; GFX12-NOEXPAND: ; %bb.0: |
| ; GFX12-NOEXPAND-NEXT: s_clause 0x1 |
| ; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-NOEXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX12-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 |
| ; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s1 |
| ; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 |
| ; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[4:5] |
| ; GFX12-NOEXPAND-NEXT: s_endpgm |
| |
| ; Both loads go through addrspace(4) pointers; the assertions show them |
| ; selected as s_load instructions. |
| %scalar_val1 = load i32, ptr addrspace(4) %scalar_ptr_a, align 4 |
| %scalar_val2 = load i32, ptr addrspace(4) %scalar_ptr_b, align 4 |
| |
| ; The add consumes both loaded values, so both must have returned before it. |
| %result = add i32 %scalar_val1, %scalar_val2 |
| store i32 %result, ptr addrspace(1) %out, align 4 |
| ret void |
| } |
| |
| ; Test that expansion is NOT applied when counters are out-of-order (mixed event types). |
| ; In pre-GFX12, LDS and SMEM operations both use DS_CNT (lgkmcnt), but they can complete |
| ; out-of-order relative to each other. When both are in-flight, we should NOT expand |
| ; because the expansion would be misleading. |
| define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_ptr, ptr addrspace(4) %smem_ptr, ptr addrspace(1) %out) #0 { |
| ; GFX9-EXPAND-LABEL: test_outoforder_lds_and_smem: |
| ; GFX9-EXPAND: ; %bb.0: |
| ; GFX9-EXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 |
| ; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c |
| ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, s6 |
| ; GFX9-EXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 |
| ; GFX9-EXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 |
| ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v1 |
| ; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, s0, v0 |
| ; GFX9-EXPAND-NEXT: global_store_dword v2, v0, s[2:3] |
| ; GFX9-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX9-NOEXPAND-LABEL: test_outoforder_lds_and_smem: |
| ; GFX9-NOEXPAND: ; %bb.0: |
| ; GFX9-NOEXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 |
| ; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c |
| ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 |
| ; GFX9-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 |
| ; GFX9-NOEXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 |
| ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v1 |
| ; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, s0, v0 |
| ; GFX9-NOEXPAND-NEXT: global_store_dword v2, v0, s[2:3] |
| ; GFX9-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX10-EXPAND-LABEL: test_outoforder_lds_and_smem: |
| ; GFX10-EXPAND: ; %bb.0: |
| ; GFX10-EXPAND-NEXT: s_clause 0x1 |
| ; GFX10-EXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 |
| ; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c |
| ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, s6 |
| ; GFX10-EXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 |
| ; GFX10-EXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 |
| ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 |
| ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 |
| ; GFX10-EXPAND-NEXT: global_store_dword v1, v0, s[2:3] |
| ; GFX10-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX10-NOEXPAND-LABEL: test_outoforder_lds_and_smem: |
| ; GFX10-NOEXPAND: ; %bb.0: |
| ; GFX10-NOEXPAND-NEXT: s_clause 0x1 |
| ; GFX10-NOEXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 |
| ; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c |
| ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 |
| ; GFX10-NOEXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 |
| ; GFX10-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 |
| ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 |
| ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 |
| ; GFX10-NOEXPAND-NEXT: global_store_dword v1, v0, s[2:3] |
| ; GFX10-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX11-EXPAND-LABEL: test_outoforder_lds_and_smem: |
| ; GFX11-EXPAND: ; %bb.0: |
| ; GFX11-EXPAND-NEXT: s_clause 0x1 |
| ; GFX11-EXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 |
| ; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c |
| ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-EXPAND-NEXT: v_mov_b32_e32 v0, s6 |
| ; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 |
| ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 |
| ; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 |
| ; GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3] |
| ; GFX11-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX11-NOEXPAND-LABEL: test_outoforder_lds_and_smem: |
| ; GFX11-NOEXPAND: ; %bb.0: |
| ; GFX11-NOEXPAND-NEXT: s_clause 0x1 |
| ; GFX11-NOEXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 |
| ; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c |
| ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 |
| ; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 |
| ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 |
| ; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 |
| ; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3] |
| ; GFX11-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX12-EXPAND-LABEL: test_outoforder_lds_and_smem: |
| ; GFX12-EXPAND: ; %bb.0: |
| ; GFX12-EXPAND-NEXT: s_clause 0x1 |
| ; GFX12-EXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 |
| ; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c |
| ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-EXPAND-NEXT: v_mov_b32_e32 v0, s6 |
| ; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 |
| ; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 |
| ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 |
| ; GFX12-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3] |
| ; GFX12-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX12-NOEXPAND-LABEL: test_outoforder_lds_and_smem: |
| ; GFX12-NOEXPAND: ; %bb.0: |
| ; GFX12-NOEXPAND-NEXT: s_clause 0x1 |
| ; GFX12-NOEXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 |
| ; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c |
| ; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 |
| ; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 |
| ; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 |
| ; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 |
| ; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3] |
| ; GFX12-NOEXPAND-NEXT: s_endpgm |
| |
| %lds_val1 = load i32, ptr addrspace(3) %lds_ptr, align 4 |
| %smem_val = load i32, ptr addrspace(4) %smem_ptr, align 4 |
| %lds_ptr2 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 1 |
| %lds_val2 = load i32, ptr addrspace(3) %lds_ptr2, align 4 |
| %sum1 = add i32 %lds_val1, %lds_val2 |
| %sum2 = add i32 %sum1, %smem_val |
| store i32 %sum2, ptr addrspace(1) %out, align 4 |
| ret void |
| } |
| |
| define amdgpu_kernel void @test_vscnt_global_stores(ptr addrspace(1) %buf) #0 { |
| ; Test vector memory stores (STORE_CNT/vscnt on GFX10-11, storecnt on GFX12+) |
| ; |
| ; Three dword stores to %buf at element offsets tid, tid+64 and tid+128 are |
| ; followed by a release fence. Every expected output below ends with a single |
| ; zero-count wait before s_endpgm: s_waitcnt vmcnt(0) on gfx900, |
| ; s_waitcnt_vscnt null, 0x0 on gfx1010/gfx1100, s_wait_storecnt 0x0 on gfx1200. |
| ; NOTE(review): for each target the expected output is the same with and |
| ; without the profiling attribute, i.e. no stepped-down wait sequence is |
| ; checked for these stores - confirm that this is the intended behavior. |
| ; GFX9-EXPAND-LABEL: test_vscnt_global_stores: |
| ; GFX9-EXPAND: ; %bb.0: ; %entry |
| ; GFX9-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, 1 |
| ; GFX9-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-EXPAND-NEXT: global_store_dword v0, v2, s[0:1] |
| ; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:256 |
| ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, 3 |
| ; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:512 |
| ; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX9-NOEXPAND-LABEL: test_vscnt_global_stores: |
| ; GFX9-NOEXPAND: ; %bb.0: ; %entry |
| ; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, 1 |
| ; GFX9-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] |
| ; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:256 |
| ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, 3 |
| ; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:512 |
| ; GFX9-NOEXPAND-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX10-EXPAND-LABEL: test_vscnt_global_stores: |
| ; GFX10-EXPAND: ; %bb.0: ; %entry |
| ; GFX10-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 1 |
| ; GFX10-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v2, 2 |
| ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v3, 3 |
| ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[0:1] |
| ; GFX10-EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:256 |
| ; GFX10-EXPAND-NEXT: global_store_dword v0, v3, s[0:1] offset:512 |
| ; GFX10-EXPAND-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX10-NOEXPAND-LABEL: test_vscnt_global_stores: |
| ; GFX10-NOEXPAND: ; %bb.0: ; %entry |
| ; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 1 |
| ; GFX10-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v2, 2 |
| ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v3, 3 |
| ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] |
| ; GFX10-NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:256 |
| ; GFX10-NOEXPAND-NEXT: global_store_dword v0, v3, s[0:1] offset:512 |
| ; GFX10-NOEXPAND-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX11-EXPAND-LABEL: test_vscnt_global_stores: |
| ; GFX11-EXPAND: ; %bb.0: ; %entry |
| ; GFX11-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 |
| ; GFX11-EXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3 |
| ; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-EXPAND-NEXT: s_clause 0x2 |
| ; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX11-EXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256 |
| ; GFX11-EXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512 |
| ; GFX11-EXPAND-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX11-NOEXPAND-LABEL: test_vscnt_global_stores: |
| ; GFX11-NOEXPAND: ; %bb.0: ; %entry |
| ; GFX11-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 |
| ; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3 |
| ; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NOEXPAND-NEXT: s_clause 0x2 |
| ; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256 |
| ; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512 |
| ; GFX11-NOEXPAND-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX12-EXPAND-LABEL: test_vscnt_global_stores: |
| ; GFX12-EXPAND: ; %bb.0: ; %entry |
| ; GFX12-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 |
| ; GFX12-EXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3 |
| ; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-EXPAND-NEXT: s_clause 0x2 |
| ; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX12-EXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256 |
| ; GFX12-EXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512 |
| ; GFX12-EXPAND-NEXT: global_wb scope:SCOPE_SYS |
| ; GFX12-EXPAND-NEXT: s_wait_storecnt 0x0 |
| ; GFX12-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX12-NOEXPAND-LABEL: test_vscnt_global_stores: |
| ; GFX12-NOEXPAND: ; %bb.0: ; %entry |
| ; GFX12-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 |
| ; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3 |
| ; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NOEXPAND-NEXT: s_clause 0x2 |
| ; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256 |
| ; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512 |
| ; GFX12-NOEXPAND-NEXT: global_wb scope:SCOPE_SYS |
| ; GFX12-NOEXPAND-NEXT: s_wait_storecnt 0x0 |
| ; GFX12-NOEXPAND-NEXT: s_endpgm |
| entry: |
| ; Per-work-item index, widened to i64 so it can be used as a GEP index. |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() |
| %tid64 = zext i32 %tid to i64 |
| |
| ; Issue multiple stores |
| ; Store #1: value 1 at i32 element index tid. |
| %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 %tid64 |
| store i32 1, ptr addrspace(1) %ptr0, align 4 |
| |
| ; Store #2: value 2, 64 i32 elements further on (offset:256 in the expected output). |
| %offset1 = add i64 %tid64, 64 |
| %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset1 |
| store i32 2, ptr addrspace(1) %ptr1, align 4 |
| |
| ; Store #3: value 3, 128 i32 elements past store #1 (offset:512 in the expected output). |
| %offset2 = add i64 %tid64, 128 |
| %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset2 |
| store i32 3, ptr addrspace(1) %ptr2, align 4 |
| |
| ; Memory fence forces wait for all stores |
| fence release |
| ret void |
| } |
| |
| define amdgpu_ps void @test_expcnt_exports(float %x, float %y, float %z, float %w) #0 { |
| ; Test export operations (EXP_CNT/expcnt) |
| ; |
| ; Three exports to targets 0, 1 and 2 are followed by a final export to target |
| ; 32 with the done bit set. None of the expected outputs below contains an |
| ; export-counter wait, and for each target the expected output is the same with |
| ; and without the profiling attribute. |
| ; NOTE(review): target 32 prints as param0 on gfx900/gfx1010 but as |
| ; invalid_target_32 on gfx1100/gfx1200 - confirm that this target is what the |
| ; test intends to exercise on the newer subtargets. |
| ; GFX9-EXPAND-LABEL: test_expcnt_exports: |
| ; GFX9-EXPAND: ; %bb.0: ; %entry |
| ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0 |
| ; GFX9-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3 |
| ; GFX9-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0 |
| ; GFX9-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2 |
| ; GFX9-EXPAND-NEXT: exp param0 v4, v4, v4, v4 done |
| ; GFX9-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX9-NOEXPAND-LABEL: test_expcnt_exports: |
| ; GFX9-NOEXPAND: ; %bb.0: ; %entry |
| ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0 |
| ; GFX9-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3 |
| ; GFX9-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0 |
| ; GFX9-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2 |
| ; GFX9-NOEXPAND-NEXT: exp param0 v4, v4, v4, v4 done |
| ; GFX9-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX10-EXPAND-LABEL: test_expcnt_exports: |
| ; GFX10-EXPAND: ; %bb.0: ; %entry |
| ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0 |
| ; GFX10-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3 |
| ; GFX10-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0 |
| ; GFX10-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2 |
| ; GFX10-EXPAND-NEXT: exp param0 v4, v4, v4, v4 done |
| ; GFX10-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX10-NOEXPAND-LABEL: test_expcnt_exports: |
| ; GFX10-NOEXPAND: ; %bb.0: ; %entry |
| ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0 |
| ; GFX10-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3 |
| ; GFX10-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0 |
| ; GFX10-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2 |
| ; GFX10-NOEXPAND-NEXT: exp param0 v4, v4, v4, v4 done |
| ; GFX10-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX11-EXPAND-LABEL: test_expcnt_exports: |
| ; GFX11-EXPAND: ; %bb.0: ; %entry |
| ; GFX11-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0 |
| ; GFX11-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3 |
| ; GFX11-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0 |
| ; GFX11-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2 |
| ; GFX11-EXPAND-NEXT: exp invalid_target_32 v4, v4, v4, v4 done |
| ; GFX11-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX11-NOEXPAND-LABEL: test_expcnt_exports: |
| ; GFX11-NOEXPAND: ; %bb.0: ; %entry |
| ; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0 |
| ; GFX11-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3 |
| ; GFX11-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0 |
| ; GFX11-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2 |
| ; GFX11-NOEXPAND-NEXT: exp invalid_target_32 v4, v4, v4, v4 done |
| ; GFX11-NOEXPAND-NEXT: s_endpgm |
| ; |
| ; GFX12-EXPAND-LABEL: test_expcnt_exports: |
| ; GFX12-EXPAND: ; %bb.0: ; %entry |
| ; GFX12-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0 |
| ; GFX12-EXPAND-NEXT: export mrt0 v0, v1, v2, v3 |
| ; GFX12-EXPAND-NEXT: export mrt1 v3, v2, v1, v0 |
| ; GFX12-EXPAND-NEXT: export mrt2 v0, v3, v1, v2 |
| ; GFX12-EXPAND-NEXT: export invalid_target_32 v4, v4, v4, v4 done |
| ; GFX12-EXPAND-NEXT: s_endpgm |
| ; |
| ; GFX12-NOEXPAND-LABEL: test_expcnt_exports: |
| ; GFX12-NOEXPAND: ; %bb.0: ; %entry |
| ; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0 |
| ; GFX12-NOEXPAND-NEXT: export mrt0 v0, v1, v2, v3 |
| ; GFX12-NOEXPAND-NEXT: export mrt1 v3, v2, v1, v0 |
| ; GFX12-NOEXPAND-NEXT: export mrt2 v0, v3, v1, v2 |
| ; GFX12-NOEXPAND-NEXT: export invalid_target_32 v4, v4, v4, v4 done |
| ; GFX12-NOEXPAND-NEXT: s_endpgm |
| entry: |
| ; Multiple MRT exports |
| ; The first operand (0, 1, 2) shows up as mrt0, mrt1, mrt2 in the expected |
| ; output; each call passes the four inputs in a different order. |
| call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 false, i1 false) |
| call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %w, float %z, float %y, float %x, i1 false, i1 false) |
| call void @llvm.amdgcn.exp.f32(i32 2, i32 15, float %x, float %w, float %y, float %z, i1 false, i1 false) |
| ; Final export with done bit |
| ; Only this call passes i1 true as its seventh operand, and only this export |
| ; carries the done modifier in the expected output; all four data operands are |
| ; the constant 1.0. |
| call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float 1.0, i1 true, i1 false) |
| ret void |
| } |
| |
| ; Export intrinsic called by test_expcnt_exports. |
| declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) |
| |
| ; Attribute group shared by the test functions. The upper-case placeholder after |
| ; nounwind is not an attribute: the sed command in each RUN line rewrites it |
| ; before llc sees the file, either to the quoted amdgpu-expand-waitcnt-profiling |
| ; string attribute or to nothing. |
| attributes #0 = { nounwind ATTRS } |