; blob: 848a9d07084ed85d4ba58bcad8be480d8a3dba66 [file] [log] [blame] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefix=GFX9-EXPAND %s
; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefix=GFX9-NOEXPAND %s
; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1010 | FileCheck --check-prefix=GFX10-EXPAND %s
; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1010 | FileCheck --check-prefix=GFX10-NOEXPAND %s
; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1100 | FileCheck --check-prefix=GFX11-EXPAND %s
; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1100 | FileCheck --check-prefix=GFX11-NOEXPAND %s
; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1200 | FileCheck --check-prefix=GFX12-EXPAND %s
; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1200 | FileCheck --check-prefix=GFX12-NOEXPAND %s
; When "amdgpu-expand-waitcnt-profiling" is enabled (in this file, through the
; quoted string that the sed commands above substitute into the IR) and there
; are N outstanding operations, instead of emitting a single waitcnt(target),
; we emit:
;   waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target)
;
; This allows PC-sampling profilers to identify which specific operation
; is causing a stall by observing where the program counter is stuck.
; Scalar-load case: three i32 loads through addrspace(4) pointers are summed and
; stored to %out. The checks below show the loads selected as s_load
; instructions followed by one wait to zero, and the expected output is the
; same whether or not the profiling attribute is present.
; NOTE(review): presumably no expansion happens here because scalar memory
; loads can return out of order - confirm against the waitcnt insertion pass.
define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, ptr addrspace(4) %ptr_b, ptr addrspace(4) %ptr_c, ptr addrspace(1) %out) #0 {
; GFX9-EXPAND-LABEL: test_lgkmcnt_scalar_loads:
; GFX9-EXPAND: ; %bb.0:
; GFX9-EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, 0
; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX9-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
; GFX9-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s1
; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s2
; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, s0
; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[14:15]
; GFX9-EXPAND-NEXT: s_endpgm
;
; GFX9-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads:
; GFX9-NOEXPAND: ; %bb.0:
; GFX9-NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX9-NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
; GFX9-NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s0, s1
; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s0, s2
; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15]
; GFX9-NOEXPAND-NEXT: s_endpgm
;
; GFX10-EXPAND-LABEL: test_lgkmcnt_scalar_loads:
; GFX10-EXPAND: ; %bb.0:
; GFX10-EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, 0
; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX10-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
; GFX10-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s1
; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s2
; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, s0
; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[14:15]
; GFX10-EXPAND-NEXT: s_endpgm
;
; GFX10-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads:
; GFX10-NOEXPAND: ; %bb.0:
; GFX10-NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX10-NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
; GFX10-NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s0, s1
; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s0, s2
; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15]
; GFX10-NOEXPAND-NEXT: s_endpgm
;
; GFX11-EXPAND-LABEL: test_lgkmcnt_scalar_loads:
; GFX11-EXPAND: ; %bb.0:
; GFX11-EXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1
; GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s2
; GFX11-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[6:7]
; GFX11-EXPAND-NEXT: s_endpgm
;
; GFX11-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads:
; GFX11-NOEXPAND: ; %bb.0:
; GFX11-NOEXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s1
; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s2
; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[6:7]
; GFX11-NOEXPAND-NEXT: s_endpgm
;
; GFX12-EXPAND-LABEL: test_lgkmcnt_scalar_loads:
; GFX12-EXPAND: ; %bb.0:
; GFX12-EXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1
; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s2
; GFX12-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[6:7]
; GFX12-EXPAND-NEXT: s_endpgm
;
; GFX12-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads:
; GFX12-NOEXPAND: ; %bb.0:
; GFX12-NOEXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX12-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s1
; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s2
; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[6:7]
; GFX12-NOEXPAND-NEXT: s_endpgm
; Three independent loads through the addrspace(4) pointer arguments.
%val_a = load i32, ptr addrspace(4) %ptr_a, align 4
%val_b = load i32, ptr addrspace(4) %ptr_b, align 4
%val_c = load i32, ptr addrspace(4) %ptr_c, align 4
; All three results are consumed by the sum that is stored to %out.
%sum1 = add i32 %val_a, %val_b
%sum2 = add i32 %sum1, %val_c
store i32 %sum2, ptr addrspace(1) %out, align 4
ret void
}
; Vector-memory case: three global loads at thread-varying addresses are issued
; before any of their results is used. With the profiling attribute the checks
; expect waits for 2, 1 and 0 outstanding loads in sequence (s_waitcnt vmcnt on
; gfx900, gfx1010 and gfx1100; s_wait_loadcnt on gfx1200); without it they
; expect only the final wait for 0.
define amdgpu_kernel void @test_vmcnt_global_loads(ptr addrspace(1) %buf, ptr addrspace(1) %out) #0 {
; GFX9-EXPAND-LABEL: test_vmcnt_global_loads:
; GFX9-EXPAND: ; %bb.0:
; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-EXPAND-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-EXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256
; GFX9-EXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512
; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(2)
; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(1)
; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(0)
; GFX9-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-EXPAND-NEXT: s_endpgm
;
; GFX9-NOEXPAND-LABEL: test_vmcnt_global_loads:
; GFX9-NOEXPAND: ; %bb.0:
; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NOEXPAND-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NOEXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256
; GFX9-NOEXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512
; GFX9-NOEXPAND-NEXT: s_waitcnt vmcnt(0)
; GFX9-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NOEXPAND-NEXT: s_endpgm
;
; GFX10-EXPAND-LABEL: test_vmcnt_global_loads:
; GFX10-EXPAND: ; %bb.0:
; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-EXPAND-NEXT: s_clause 0x2
; GFX10-EXPAND-NEXT: global_load_dword v1, v0, s[0:1]
; GFX10-EXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256
; GFX10-EXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512
; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(2)
; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(1)
; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(0)
; GFX10-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-EXPAND-NEXT: s_endpgm
;
; GFX10-NOEXPAND-LABEL: test_vmcnt_global_loads:
; GFX10-NOEXPAND: ; %bb.0:
; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NOEXPAND-NEXT: s_clause 0x2
; GFX10-NOEXPAND-NEXT: global_load_dword v1, v0, s[0:1]
; GFX10-NOEXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256
; GFX10-NOEXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512
; GFX10-NOEXPAND-NEXT: s_waitcnt vmcnt(0)
; GFX10-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NOEXPAND-NEXT: s_endpgm
;
; GFX11-EXPAND-LABEL: test_vmcnt_global_loads:
; GFX11-EXPAND: ; %bb.0:
; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-EXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-EXPAND-NEXT: s_clause 0x2
; GFX11-EXPAND-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-EXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256
; GFX11-EXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512
; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(2)
; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(1)
; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(0)
; GFX11-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-EXPAND-NEXT: s_endpgm
;
; GFX11-NOEXPAND-LABEL: test_vmcnt_global_loads:
; GFX11-NOEXPAND: ; %bb.0:
; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NOEXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOEXPAND-NEXT: s_clause 0x2
; GFX11-NOEXPAND-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-NOEXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256
; GFX11-NOEXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512
; GFX11-NOEXPAND-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NOEXPAND-NEXT: s_endpgm
;
; GFX12-EXPAND-LABEL: test_vmcnt_global_loads:
; GFX12-EXPAND: ; %bb.0:
; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-EXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-EXPAND-NEXT: s_clause 0x2
; GFX12-EXPAND-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX12-EXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256
; GFX12-EXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512
; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x2
; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x1
; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x0
; GFX12-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-EXPAND-NEXT: s_endpgm
;
; GFX12-NOEXPAND-LABEL: test_vmcnt_global_loads:
; GFX12-NOEXPAND: ; %bb.0:
; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NOEXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-NOEXPAND-NEXT: s_clause 0x2
; GFX12-NOEXPAND-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX12-NOEXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256
; GFX12-NOEXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512
; GFX12-NOEXPAND-NEXT: s_wait_loadcnt 0x0
; GFX12-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NOEXPAND-NEXT: s_endpgm
; Use thread ID to create thread-varying addresses -> forces vector loads
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid64 = zext i32 %tid to i64
; Three separate global loads with thread-varying addresses
; Non-volatile loads allow multiple operations to be in-flight
%ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 %tid64
%val0 = load i32, ptr addrspace(1) %ptr0, align 4
; Second load, 64 i32 elements further on (offset:256 bytes in the checks).
%offset1 = add i64 %tid64, 64
%ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset1
%val1 = load i32, ptr addrspace(1) %ptr1, align 4
; Third load, 128 i32 elements further on (offset:512 bytes in the checks).
%offset2 = add i64 %tid64, 128
%ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset2
%val2 = load i32, ptr addrspace(1) %ptr2, align 4
; The sum uses all three results and is stored to the per-thread slot of %out.
%sum1 = add i32 %val0, %val1
%sum2 = add i32 %sum1, %val2
%out_ptr = getelementptr i32, ptr addrspace(1) %out, i64 %tid64
store i32 %sum2, ptr addrspace(1) %out_ptr, align 4
ret void
}
; AMDGPU intrinsic called by @test_vmcnt_global_loads to obtain a per-thread
; index (the workitem ID in the X dimension, going by its name).
declare i32 @llvm.amdgcn.workitem.id.x()
; LDS case: three i32 loads from consecutive addrspace(3) addresses. The checks
; show the first two combined into one two-address DS read, leaving two DS
; operations in flight. A wait for 1 and then a wait for 0 outstanding
; operations each precede the add that consumes the corresponding result
; (s_waitcnt lgkmcnt on gfx900, gfx1010 and gfx1100; s_wait_dscnt on gfx1200),
; and the expected output is identical with and without the profiling attribute.
define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr, ptr addrspace(1) %out) #0 {
; GFX9-EXPAND-LABEL: test_lgkmcnt_lds_operations:
; GFX9-EXPAND: ; %bb.0:
; GFX9-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24
; GFX9-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GFX9-EXPAND-NEXT: v_mov_b32_e32 v3, 0
; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, s2
; GFX9-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; GFX9-EXPAND-NEXT: ds_read_b32 v2, v2 offset:8
; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-EXPAND-NEXT: global_store_dword v3, v0, s[0:1]
; GFX9-EXPAND-NEXT: s_endpgm
;
; GFX9-NOEXPAND-LABEL: test_lgkmcnt_lds_operations:
; GFX9-NOEXPAND: ; %bb.0:
; GFX9-NOEXPAND-NEXT: s_load_dword s2, s[4:5], 0x24
; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; GFX9-NOEXPAND-NEXT: ds_read_b32 v2, v2 offset:8
; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NOEXPAND-NEXT: global_store_dword v3, v0, s[0:1]
; GFX9-NOEXPAND-NEXT: s_endpgm
;
; GFX10-EXPAND-LABEL: test_lgkmcnt_lds_operations:
; GFX10-EXPAND: ; %bb.0:
; GFX10-EXPAND-NEXT: s_clause 0x1
; GFX10-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24
; GFX10-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-EXPAND-NEXT: v_mov_b32_e32 v2, s2
; GFX10-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; GFX10-EXPAND-NEXT: ds_read_b32 v2, v2 offset:8
; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 0
; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX10-EXPAND-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-EXPAND-NEXT: s_endpgm
;
; GFX10-NOEXPAND-LABEL: test_lgkmcnt_lds_operations:
; GFX10-NOEXPAND: ; %bb.0:
; GFX10-NOEXPAND-NEXT: s_clause 0x1
; GFX10-NOEXPAND-NEXT: s_load_dword s2, s[4:5], 0x24
; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; GFX10-NOEXPAND-NEXT: ds_read_b32 v2, v2 offset:8
; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1)
; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX10-NOEXPAND-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NOEXPAND-NEXT: s_endpgm
;
; GFX11-EXPAND-LABEL: test_lgkmcnt_lds_operations:
; GFX11-EXPAND: ; %bb.0:
; GFX11-EXPAND-NEXT: s_clause 0x1
; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-EXPAND-NEXT: v_mov_b32_e32 v2, s2
; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
; GFX11-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-EXPAND-NEXT: s_endpgm
;
; GFX11-NOEXPAND-LABEL: test_lgkmcnt_lds_operations:
; GFX11-NOEXPAND: ; %bb.0:
; GFX11-NOEXPAND-NEXT: s_clause 0x1
; GFX11-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
; GFX11-NOEXPAND-NEXT: ds_load_b32 v2, v2 offset:8
; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1)
; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NOEXPAND-NEXT: s_endpgm
;
; GFX12-EXPAND-LABEL: test_lgkmcnt_lds_operations:
; GFX12-EXPAND: ; %bb.0:
; GFX12-EXPAND-NEXT: s_clause 0x1
; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX12-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-EXPAND-NEXT: v_mov_b32_e32 v2, s2
; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
; GFX12-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8
; GFX12-EXPAND-NEXT: s_wait_dscnt 0x1
; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0
; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX12-EXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-EXPAND-NEXT: s_endpgm
;
; GFX12-NOEXPAND-LABEL: test_lgkmcnt_lds_operations:
; GFX12-NOEXPAND: ; %bb.0:
; GFX12-NOEXPAND-NEXT: s_clause 0x1
; GFX12-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX12-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
; GFX12-NOEXPAND-NEXT: ds_load_b32 v2, v2 offset:8
; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x1
; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0
; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NOEXPAND-NEXT: s_endpgm
; Addresses of three consecutive i32 slots starting at %lds_ptr.
%ptr0 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 0
%ptr1 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 1
%ptr2 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 2
; All three loads are issued before any result is consumed.
%val0 = load i32, ptr addrspace(3) %ptr0, align 4
%val1 = load i32, ptr addrspace(3) %ptr1, align 4
%val2 = load i32, ptr addrspace(3) %ptr2, align 4
; %sum1 needs only the first two values; %sum2 additionally needs the third.
%sum1 = add i32 %val0, %val1
%sum2 = add i32 %sum1, %val2
store i32 %sum2, ptr addrspace(1) %out, align 4
ret void
}
; Two i32 loads through addrspace(4) pointers, added and stored to %out. The
; checks expect a single wait to zero after the two scalar loads, with and
; without the profiling attribute.
; NOTE(review): despite the function name, the IR contains no vector-memory
; load, so no vmcnt/loadcnt wait appears in any of the checks - confirm whether
; a global load was meant to be included here.
define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_ptr_a, ptr addrspace(4) %scalar_ptr_b, ptr addrspace(1) %out) #0 {
; GFX9-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
; GFX9-EXPAND: ; %bb.0:
; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, 0
; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-EXPAND-NEXT: s_add_i32 s0, s4, s5
; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, s0
; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-EXPAND-NEXT: s_endpgm
;
; GFX9-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
; GFX9-NOEXPAND: ; %bb.0:
; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s4, s5
; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NOEXPAND-NEXT: s_endpgm
;
; GFX10-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
; GFX10-EXPAND: ; %bb.0:
; GFX10-EXPAND-NEXT: s_clause 0x1
; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, 0
; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX10-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-EXPAND-NEXT: s_add_i32 s0, s4, s5
; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, s0
; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-EXPAND-NEXT: s_endpgm
;
; GFX10-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
; GFX10-NOEXPAND: ; %bb.0:
; GFX10-NOEXPAND-NEXT: s_clause 0x1
; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX10-NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s4, s5
; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-NOEXPAND-NEXT: s_endpgm
;
; GFX11-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
; GFX11-EXPAND: ; %bb.0:
; GFX11-EXPAND-NEXT: s_clause 0x1
; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1
; GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-EXPAND-NEXT: s_endpgm
;
; GFX11-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
; GFX11-NOEXPAND: ; %bb.0:
; GFX11-NOEXPAND-NEXT: s_clause 0x1
; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NOEXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s1
; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NOEXPAND-NEXT: s_endpgm
;
; GFX12-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
; GFX12-EXPAND: ; %bb.0:
; GFX12-EXPAND-NEXT: s_clause 0x1
; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1
; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-EXPAND-NEXT: s_endpgm
;
; GFX12-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
; GFX12-NOEXPAND: ; %bb.0:
; GFX12-NOEXPAND-NEXT: s_clause 0x1
; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NOEXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s1
; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NOEXPAND-NEXT: s_endpgm
; Both loads go through addrspace(4) pointers; the only addrspace(1) access is
; the final store.
%scalar_val1 = load i32, ptr addrspace(4) %scalar_ptr_a, align 4
%scalar_val2 = load i32, ptr addrspace(4) %scalar_ptr_b, align 4
%result = add i32 %scalar_val1, %scalar_val2
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
; Test that expansion is NOT applied when counters are out-of-order (mixed event types).
; In pre-GFX12, LDS and SMEM operations both use DS_CNT (lgkmcnt), but they can complete
; out-of-order relative to each other. When both are in-flight, we should NOT expand
; because the expansion would be misleading.
;
; The IR issues two LDS loads and one SMEM load before any result is used. The
; checks for gfx900, gfx1010 and gfx1100 expect one s_waitcnt lgkmcnt(0)
; covering all of them in both modes; the gfx1200 checks expect separate
; s_wait_dscnt and s_wait_kmcnt waits to zero, again identical in both modes.
define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_ptr, ptr addrspace(4) %smem_ptr, ptr addrspace(1) %out) #0 {
; GFX9-EXPAND-LABEL: test_outoforder_lds_and_smem:
; GFX9-EXPAND: ; %bb.0:
; GFX9-EXPAND-NEXT: s_load_dword s6, s[4:5], 0x24
; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, 0
; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, s6
; GFX9-EXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-EXPAND-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, s0, v0
; GFX9-EXPAND-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-EXPAND-NEXT: s_endpgm
;
; GFX9-NOEXPAND-LABEL: test_outoforder_lds_and_smem:
; GFX9-NOEXPAND: ; %bb.0:
; GFX9-NOEXPAND-NEXT: s_load_dword s6, s[4:5], 0x24
; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NOEXPAND-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, s0, v0
; GFX9-NOEXPAND-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NOEXPAND-NEXT: s_endpgm
;
; GFX10-EXPAND-LABEL: test_outoforder_lds_and_smem:
; GFX10-EXPAND: ; %bb.0:
; GFX10-EXPAND-NEXT: s_clause 0x1
; GFX10-EXPAND-NEXT: s_load_dword s6, s[4:5], 0x24
; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, s6
; GFX10-EXPAND-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-EXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 0
; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX10-EXPAND-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-EXPAND-NEXT: s_endpgm
;
; GFX10-NOEXPAND-LABEL: test_outoforder_lds_and_smem:
; GFX10-NOEXPAND: ; %bb.0:
; GFX10-NOEXPAND-NEXT: s_clause 0x1
; GFX10-NOEXPAND-NEXT: s_load_dword s6, s[4:5], 0x24
; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6
; GFX10-NOEXPAND-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX10-NOEXPAND-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NOEXPAND-NEXT: s_endpgm
;
; GFX11-EXPAND-LABEL: test_outoforder_lds_and_smem:
; GFX11-EXPAND: ; %bb.0:
; GFX11-EXPAND-NEXT: s_clause 0x1
; GFX11-EXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24
; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-EXPAND-NEXT: v_mov_b32_e32 v0, s6
; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-EXPAND-NEXT: s_endpgm
;
; GFX11-NOEXPAND-LABEL: test_outoforder_lds_and_smem:
; GFX11-NOEXPAND: ; %bb.0:
; GFX11-NOEXPAND-NEXT: s_clause 0x1
; GFX11-NOEXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24
; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6
; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NOEXPAND-NEXT: s_endpgm
;
; GFX12-EXPAND-LABEL: test_outoforder_lds_and_smem:
; GFX12-EXPAND: ; %bb.0:
; GFX12-EXPAND-NEXT: s_clause 0x1
; GFX12-EXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24
; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-EXPAND-NEXT: v_mov_b32_e32 v0, s6
; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0
; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-EXPAND-NEXT: s_endpgm
;
; GFX12-NOEXPAND-LABEL: test_outoforder_lds_and_smem:
; GFX12-NOEXPAND: ; %bb.0:
; GFX12-NOEXPAND-NEXT: s_clause 0x1
; GFX12-NOEXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24
; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0
; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-NOEXPAND-NEXT: s_endpgm
; An LDS load, then an SMEM load, then a second LDS load from the next i32 slot,
; so both kinds of operation are outstanding before the first use.
%lds_val1 = load i32, ptr addrspace(3) %lds_ptr, align 4
%smem_val = load i32, ptr addrspace(4) %smem_ptr, align 4
%lds_ptr2 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 1
%lds_val2 = load i32, ptr addrspace(3) %lds_ptr2, align 4
; The two LDS values are added first; the SMEM value is added last.
%sum1 = add i32 %lds_val1, %lds_val2
%sum2 = add i32 %sum1, %smem_val
store i32 %sum2, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @test_vscnt_global_stores(ptr addrspace(1) %buf) #0 {
; Test vector memory stores (STORE_CNT/vscnt on GFX10-11, storecnt on GFX12+)
;
; Three global stores are issued back to back and are followed by a release
; fence. The wait of interest is the one between the last store and s_endpgm.
; In the expectations below it is spelled per target as
;   GFX9         s_waitcnt vmcnt(0)
;   GFX10/GFX11  s_waitcnt_vscnt null, 0x0
;   GFX12        global_wb scope:SCOPE_SYS, then s_wait_storecnt 0x0
;
; For every target the expectations with and without the profiling attribute
; are identical: one wait to zero, not a descending sequence of waits.
; NOTE(review): presumably the expansion is deliberately not applied to this
; wait - confirm against the pass before relying on this test for coverage.
; GFX9-EXPAND-LABEL: test_vscnt_global_stores:
; GFX9-EXPAND: ; %bb.0: ; %entry
; GFX9-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, 2
; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, 1
; GFX9-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-EXPAND-NEXT: global_store_dword v0, v2, s[0:1]
; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:256
; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, 3
; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:512
; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(0)
; GFX9-EXPAND-NEXT: s_endpgm
;
; GFX9-NOEXPAND-LABEL: test_vscnt_global_stores:
; GFX9-NOEXPAND: ; %bb.0: ; %entry
; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, 2
; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, 1
; GFX9-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1]
; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:256
; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, 3
; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:512
; GFX9-NOEXPAND-NEXT: s_waitcnt vmcnt(0)
; GFX9-NOEXPAND-NEXT: s_endpgm
;
; GFX10-EXPAND-LABEL: test_vscnt_global_stores:
; GFX10-EXPAND: ; %bb.0: ; %entry
; GFX10-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 1
; GFX10-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-EXPAND-NEXT: v_mov_b32_e32 v2, 2
; GFX10-EXPAND-NEXT: v_mov_b32_e32 v3, 3
; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:256
; GFX10-EXPAND-NEXT: global_store_dword v0, v3, s[0:1] offset:512
; GFX10-EXPAND-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-EXPAND-NEXT: s_endpgm
;
; GFX10-NOEXPAND-LABEL: test_vscnt_global_stores:
; GFX10-NOEXPAND: ; %bb.0: ; %entry
; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 1
; GFX10-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v2, 2
; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v3, 3
; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:256
; GFX10-NOEXPAND-NEXT: global_store_dword v0, v3, s[0:1] offset:512
; GFX10-NOEXPAND-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NOEXPAND-NEXT: s_endpgm
;
; GFX11-EXPAND-LABEL: test_vscnt_global_stores:
; GFX11-EXPAND: ; %bb.0: ; %entry
; GFX11-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-EXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3
; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-EXPAND-NEXT: s_clause 0x2
; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-EXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256
; GFX11-EXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512
; GFX11-EXPAND-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-EXPAND-NEXT: s_endpgm
;
; GFX11-NOEXPAND-LABEL: test_vscnt_global_stores:
; GFX11-NOEXPAND: ; %bb.0: ; %entry
; GFX11-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3
; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOEXPAND-NEXT: s_clause 0x2
; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256
; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512
; GFX11-NOEXPAND-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NOEXPAND-NEXT: s_endpgm
;
; GFX12-EXPAND-LABEL: test_vscnt_global_stores:
; GFX12-EXPAND: ; %bb.0: ; %entry
; GFX12-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-EXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3
; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-EXPAND-NEXT: s_clause 0x2
; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-EXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256
; GFX12-EXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512
; GFX12-EXPAND-NEXT: global_wb scope:SCOPE_SYS
; GFX12-EXPAND-NEXT: s_wait_storecnt 0x0
; GFX12-EXPAND-NEXT: s_endpgm
;
; GFX12-NOEXPAND-LABEL: test_vscnt_global_stores:
; GFX12-NOEXPAND: ; %bb.0: ; %entry
; GFX12-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3
; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-NOEXPAND-NEXT: s_clause 0x2
; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256
; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512
; GFX12-NOEXPAND-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NOEXPAND-NEXT: s_wait_storecnt 0x0
; GFX12-NOEXPAND-NEXT: s_endpgm
entry:
; Per-lane element index: each work-item writes its own i32 slot of %buf.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid64 = zext i32 %tid to i64
; Issue multiple stores
; Store #1 writes the constant 1 at element index tid.
%ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 %tid64
store i32 1, ptr addrspace(1) %ptr0, align 4
; Store #2 is 64 i32 elements further on, which is the offset:256 seen in the
; expected assembly.
%offset1 = add i64 %tid64, 64
%ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset1
store i32 2, ptr addrspace(1) %ptr1, align 4
; Store #3 is 128 i32 elements further on, which is the offset:512 seen in the
; expected assembly.
%offset2 = add i64 %tid64, 128
%ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset2
store i32 3, ptr addrspace(1) %ptr2, align 4
; Memory fence forces wait for all stores
; (this is what produces the wait-to-zero right before s_endpgm above).
fence release
ret void
}
define amdgpu_ps void @test_expcnt_exports(float %x, float %y, float %z, float %w) #0 {
; Test export operations (EXP_CNT/expcnt)
;
; A pixel-shader entry point issues three exports to targets 0-2 (printed as
; mrt0-mrt2) and a final export to target 32 with the done bit set.
;
; What the expectations below actually pin down:
;  * No wait instruction of any kind appears on any target, and the output
;    with and without the profiling attribute is identical.
;    NOTE(review): as written this function therefore does not seem to
;    exercise an export-counter expansion at all - confirm whether a use that
;    forces an export-counter wait was intended.
;  * Target 32 prints as "param0" on GFX9/GFX10 but as "invalid_target_32" on
;    GFX11/GFX12.
;    NOTE(review): this suggests target 32 is not a valid export target on
;    the newer subtargets - confirm whether a target that is valid everywhere
;    should be used for the final export.
;  * GFX12 prints the mnemonic as "export" where the older targets print "exp".
; GFX9-EXPAND-LABEL: test_expcnt_exports:
; GFX9-EXPAND: ; %bb.0: ; %entry
; GFX9-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0
; GFX9-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3
; GFX9-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0
; GFX9-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2
; GFX9-EXPAND-NEXT: exp param0 v4, v4, v4, v4 done
; GFX9-EXPAND-NEXT: s_endpgm
;
; GFX9-NOEXPAND-LABEL: test_expcnt_exports:
; GFX9-NOEXPAND: ; %bb.0: ; %entry
; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0
; GFX9-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3
; GFX9-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0
; GFX9-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2
; GFX9-NOEXPAND-NEXT: exp param0 v4, v4, v4, v4 done
; GFX9-NOEXPAND-NEXT: s_endpgm
;
; GFX10-EXPAND-LABEL: test_expcnt_exports:
; GFX10-EXPAND: ; %bb.0: ; %entry
; GFX10-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0
; GFX10-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3
; GFX10-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0
; GFX10-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2
; GFX10-EXPAND-NEXT: exp param0 v4, v4, v4, v4 done
; GFX10-EXPAND-NEXT: s_endpgm
;
; GFX10-NOEXPAND-LABEL: test_expcnt_exports:
; GFX10-NOEXPAND: ; %bb.0: ; %entry
; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0
; GFX10-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3
; GFX10-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0
; GFX10-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2
; GFX10-NOEXPAND-NEXT: exp param0 v4, v4, v4, v4 done
; GFX10-NOEXPAND-NEXT: s_endpgm
;
; GFX11-EXPAND-LABEL: test_expcnt_exports:
; GFX11-EXPAND: ; %bb.0: ; %entry
; GFX11-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3
; GFX11-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0
; GFX11-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2
; GFX11-EXPAND-NEXT: exp invalid_target_32 v4, v4, v4, v4 done
; GFX11-EXPAND-NEXT: s_endpgm
;
; GFX11-NOEXPAND-LABEL: test_expcnt_exports:
; GFX11-NOEXPAND: ; %bb.0: ; %entry
; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3
; GFX11-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0
; GFX11-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2
; GFX11-NOEXPAND-NEXT: exp invalid_target_32 v4, v4, v4, v4 done
; GFX11-NOEXPAND-NEXT: s_endpgm
;
; GFX12-EXPAND-LABEL: test_expcnt_exports:
; GFX12-EXPAND: ; %bb.0: ; %entry
; GFX12-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0
; GFX12-EXPAND-NEXT: export mrt0 v0, v1, v2, v3
; GFX12-EXPAND-NEXT: export mrt1 v3, v2, v1, v0
; GFX12-EXPAND-NEXT: export mrt2 v0, v3, v1, v2
; GFX12-EXPAND-NEXT: export invalid_target_32 v4, v4, v4, v4 done
; GFX12-EXPAND-NEXT: s_endpgm
;
; GFX12-NOEXPAND-LABEL: test_expcnt_exports:
; GFX12-NOEXPAND: ; %bb.0: ; %entry
; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0
; GFX12-NOEXPAND-NEXT: export mrt0 v0, v1, v2, v3
; GFX12-NOEXPAND-NEXT: export mrt1 v3, v2, v1, v0
; GFX12-NOEXPAND-NEXT: export mrt2 v0, v3, v1, v2
; GFX12-NOEXPAND-NEXT: export invalid_target_32 v4, v4, v4, v4 done
; GFX12-NOEXPAND-NEXT: s_endpgm
entry:
; Multiple MRT exports
; The same four inputs are passed in a different order to each target, so the
; three exports read v0-v3 in distinct permutations in the expected assembly.
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 false, i1 false)
call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %w, float %z, float %y, float %x, i1 false, i1 false)
call void @llvm.amdgcn.exp.f32(i32 2, i32 15, float %x, float %w, float %y, float %z, i1 false, i1 false)
; Final export with done bit
; (seventh operand is true; all four payload values are the constant 1.0,
; which is materialised once into v4 in the expected assembly).
call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float 1.0, i1 true, i1 false)
ret void
}
; Export intrinsic used by test_expcnt_exports. As observable from that test,
; the first operand selects the export target, the four float operands are the
; payload, and a true seventh operand yields the "done" modifier in the output.
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1)
; Attribute group shared by the test functions. The upper-case placeholder in
; the list below is rewritten by the sed commands in the run lines at the top of
; this file: to the quoted profiling attribute for the EXPAND configurations,
; and to nothing for the NOEXPAND configurations.
attributes #0 = { nounwind ATTRS }