; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefix=CI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s

; LDS (addrspace(3)) globals used by the reordering tests below: a struct that
; serves as a memcpy destination, plus LDS slots holding pointers into LDS,
; constant (addrspace(4)), and global (addrspace(1)) memory that the tests
; load through.
%struct.lds = type { [64 x ptr], [16 x i8] }
@stored_lds_struct = addrspace(3) global %struct.lds poison, align 16
@stored_lds_ptr = addrspace(3) global ptr addrspace(3) poison, align 4
@stored_constant_ptr = addrspace(3) global ptr addrspace(4) poison, align 8
@stored_global_ptr = addrspace(3) global ptr addrspace(1) poison, align 8
define amdgpu_kernel void @no_reorder_flat_load_local_store_local_load(ptr addrspace(3) %out, ptr %fptr) #0 {
; CI-LABEL: no_reorder_flat_load_local_store_local_load:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
; CI-NEXT:    v_mov_b32_e32 v4, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT:    s_load_dword s0, s[4:5], 0x9
; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT:    ds_write_b128 v4, v[0:3] offset:512
; CI-NEXT:    ds_read2_b32 v[0:1], v4 offset0:129 offset1:130
; CI-NEXT:    v_mov_b32_e32 v2, s0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: no_reorder_flat_load_local_store_local_load:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_write_b128 v4, v[0:3] offset:512
; GFX9-NEXT:    ds_read2_b32 v[0:1], v4 offset0:129 offset1:130
; GFX9-NEXT:    v_mov_b32_e32 v2, s0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
; GFX9-NEXT:    s_endpgm
; The memcpy expands to a flat load feeding a 16-byte LDS store into field 1
; of @stored_lds_struct. %ptr2 points 4 bytes into that same field, so the
; <2 x i32> load reads bytes the memcpy just wrote: the ds_read must stay
; scheduled after the ds_write (and the flat load may alias anything, so it
; cannot move past the LDS traffic either).
  %ptr1 = getelementptr %struct.lds, ptr addrspace(3) @stored_lds_struct, i32 0, i32 1
  %ptr2 = getelementptr %struct.lds, ptr addrspace(3) @stored_lds_struct, i32 0, i32 1, i32 4
  call void @llvm.memcpy.p3.p0(ptr addrspace(3) align 16 %ptr1, ptr align 8 %fptr, i64 16, i1 false)
  %vector_load = load <2 x i32>, ptr addrspace(3) %ptr2, align 4
  store <2 x i32> %vector_load, ptr addrspace(3) %out, align 4
  ret void
}
define amdgpu_kernel void @reorder_local_load_global_store_local_load(ptr addrspace(1) %out, ptr addrspace(1) %gptr) #0 {
; CI-LABEL: reorder_local_load_global_store_local_load:
; CI:       ; %bb.0:
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_read_b32 v0, v0
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    v_mov_b32_e32 v2, 0x63
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    ds_read2_b32 v[0:1], v0 offset0:1 offset1:3
; CI-NEXT:    s_mov_b32 s4, s0
; CI-NEXT:    s_mov_b32 s5, s1
; CI-NEXT:    s_mov_b32 s0, s2
; CI-NEXT:    s_mov_b32 s1, s3
; CI-NEXT:    s_mov_b32 s2, s6
; CI-NEXT:    s_mov_b32 s3, s7
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; CI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: reorder_local_load_global_store_local_load:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    ds_read_b32 v0, v2
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v3, 0x63
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset0:1 offset1:3
; GFX9-NEXT:    global_store_dword v2, v3, s[2:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
; The store goes to addrspace(1) while both loads read addrspace(3); distinct
; address spaces cannot alias, so the scheduler is free to pull the second
; local load up next to the first — the checks show both combined into a
; single ds_read2_b32 ahead of (CI) or alongside (GFX9) the global store.
  %ptr0 = load ptr addrspace(3), ptr addrspace(3) @stored_lds_ptr, align 4

  %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 1
  %ptr2 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 3

  %tmp1 = load i32, ptr addrspace(3) %ptr1, align 4
  store i32 99, ptr addrspace(1) %gptr, align 4
  %tmp2 = load i32, ptr addrspace(3) %ptr2, align 4

  %add = add nsw i32 %tmp1, %tmp2

  store i32 %add, ptr addrspace(1) %out, align 4
  ret void
}
define amdgpu_kernel void @no_reorder_local_load_volatile_global_store_local_load(ptr addrspace(1) %out, ptr addrspace(1) %gptr) #0 {
; CI-LABEL: no_reorder_local_load_volatile_global_store_local_load:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_read_b32 v0, v0
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b32 s8, s2
; CI-NEXT:    s_mov_b32 s9, s3
; CI-NEXT:    s_mov_b32 s10, s6
; CI-NEXT:    s_mov_b32 s11, s7
; CI-NEXT:    v_mov_b32_e32 v2, 0x63
; CI-NEXT:    ds_read_b32 v1, v0 offset:4
; CI-NEXT:    buffer_store_dword v2, off, s[8:11], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    ds_read_b32 v0, v0 offset:12
; CI-NEXT:    s_mov_b32 s4, s0
; CI-NEXT:    s_mov_b32 s5, s1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: no_reorder_local_load_volatile_global_store_local_load:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    ds_read_b32 v1, v0
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, 0x63
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_read_b32 v3, v1 offset:4
; GFX9-NEXT:    global_store_dword v0, v2, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ds_read_b32 v1, v1 offset:12
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v1, v3, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
; Same shape as the previous test, but the global store is volatile, so the
; second local load may not be hoisted across it: the checks keep the second
; ds_read after the store and its s_waitcnt vmcnt(0), and the two loads are
; not merged into a ds_read2.
  %ptr0 = load ptr addrspace(3), ptr addrspace(3) @stored_lds_ptr, align 4

  %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 1
  %ptr2 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 3

  %tmp1 = load i32, ptr addrspace(3) %ptr1, align 4
  store volatile i32 99, ptr addrspace(1) %gptr, align 4
  %tmp2 = load i32, ptr addrspace(3) %ptr2, align 4

  %add = add nsw i32 %tmp1, %tmp2

  store i32 %add, ptr addrspace(1) %out, align 4
  ret void
}
define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load(ptr addrspace(1) %out, ptr addrspace(1) %gptr) #0 {
; CI-LABEL: no_reorder_barrier_local_load_global_store_local_load:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_read_b32 v0, v0
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b32 s8, s2
; CI-NEXT:    s_mov_b32 s9, s3
; CI-NEXT:    s_mov_b32 s10, s6
; CI-NEXT:    s_mov_b32 s11, s7
; CI-NEXT:    v_mov_b32_e32 v2, 0x63
; CI-NEXT:    ds_read_b32 v1, v0 offset:4
; CI-NEXT:    buffer_store_dword v2, off, s[8:11], 0
; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT:    s_barrier
; CI-NEXT:    ds_read_b32 v0, v0 offset:12
; CI-NEXT:    s_mov_b32 s4, s0
; CI-NEXT:    s_mov_b32 s5, s1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: no_reorder_barrier_local_load_global_store_local_load:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    ds_read_b32 v1, v0
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, 0x63
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_read_b32 v3, v1 offset:4
; GFX9-NEXT:    global_store_dword v0, v2, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_barrier
; GFX9-NEXT:    ds_read_b32 v1, v1 offset:12
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v1, v3, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
; A convergent s_barrier sits between the two local loads; memory operations
; must not be moved across it, so the second ds_read stays below the barrier
; and the loads are not combined.
  %ptr0 = load ptr addrspace(3), ptr addrspace(3) @stored_lds_ptr, align 4

  %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 1
  %ptr2 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 3

  %tmp1 = load i32, ptr addrspace(3) %ptr1, align 4
  store i32 99, ptr addrspace(1) %gptr, align 4
  call void @llvm.amdgcn.s.barrier() #1
  %tmp2 = load i32, ptr addrspace(3) %ptr2, align 4

  %add = add nsw i32 %tmp1, %tmp2

  store i32 %add, ptr addrspace(1) %out, align 4
  ret void
}
define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(ptr addrspace(1) %out, ptr addrspace(1) %gptr) #0 {
; CI-LABEL: reorder_constant_load_global_store_constant_load:
; CI:       ; %bb.0:
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    ds_read_b64 v[0:1], v0
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_mov_b32 s10, s6
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b32 s8, s2
; CI-NEXT:    s_mov_b32 s9, s3
; CI-NEXT:    s_mov_b32 s11, s7
; CI-NEXT:    v_readfirstlane_b32 s2, v0
; CI-NEXT:    v_readfirstlane_b32 s3, v1
; CI-NEXT:    v_mov_b32_e32 v0, 0x63
; CI-NEXT:    s_load_dword s12, s[2:3], 0x1
; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; CI-NEXT:    s_load_dword s2, s[2:3], 0x3
; CI-NEXT:    s_mov_b32 s4, s0
; CI-NEXT:    s_mov_b32 s5, s1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_add_i32 s0, s12, s2
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: reorder_constant_load_global_store_constant_load:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    ds_read_b64 v[0:1], v2
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v3, 0x63
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x4
; GFX9-NEXT:    global_store_dword v2, v3, s[2:3]
; GFX9-NEXT:    s_load_dword s2, s[4:5], 0xc
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, s2, s6
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
; Both loads read addrspace(4) (constant) memory through a pointer fetched
; from LDS; constant memory cannot be written, so the store to %gptr cannot
; alias them. The checks show both loads selected as scalar s_load_dword and
; scheduled around the global store.
  %ptr0 = load ptr addrspace(4), ptr addrspace(3) @stored_constant_ptr, align 8

  %ptr1 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 1
  %ptr2 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 3

  %tmp1 = load i32, ptr addrspace(4) %ptr1, align 4
  store i32 99, ptr addrspace(1) %gptr, align 4
  %tmp2 = load i32, ptr addrspace(4) %ptr2, align 4

  %add = add nsw i32 %tmp1, %tmp2

  store i32 %add, ptr addrspace(1) %out, align 4
  ret void
}
define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(ptr addrspace(1) %out, ptr addrspace(3) %lptr) #0 {
; CI-LABEL: reorder_constant_load_local_store_constant_load:
; CI:       ; %bb.0:
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_read_b64 v[0:1], v0
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_load_dword s6, s[4:5], 0xb
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_readfirstlane_b32 s4, v0
; CI-NEXT:    v_readfirstlane_b32 s5, v1
; CI-NEXT:    s_load_dword s7, s[4:5], 0x1
; CI-NEXT:    s_load_dword s4, s[4:5], 0x3
; CI-NEXT:    v_mov_b32_e32 v0, 0x63
; CI-NEXT:    v_mov_b32_e32 v1, s6
; CI-NEXT:    ds_write_b32 v1, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_add_i32 s4, s7, s4
; CI-NEXT:    v_mov_b32_e32 v0, s4
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: reorder_constant_load_local_store_constant_load:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    ds_read_b64 v[0:1], v2
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x4
; GFX9-NEXT:    s_load_dword s8, s[0:1], 0xc
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x63
; GFX9-NEXT:    v_mov_b32_e32 v1, s6
; GFX9-NEXT:    ds_write_b32 v1, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s7, s8
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    global_store_dword v2, v0, s[2:3]
; GFX9-NEXT:    s_endpgm
; As above, but the intervening store is to LDS. The LDS write cannot alias
; constant memory, so both s_load_dwords are hoisted above the ds_write.
  %ptr0 = load ptr addrspace(4), ptr addrspace(3) @stored_constant_ptr, align 8

  %ptr1 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 1
  %ptr2 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 3

  %tmp1 = load i32, ptr addrspace(4) %ptr1, align 4
  store i32 99, ptr addrspace(3) %lptr, align 4
  %tmp2 = load i32, ptr addrspace(4) %ptr2, align 4

  %add = add nsw i32 %tmp1, %tmp2

  store i32 %add, ptr addrspace(1) %out, align 4
  ret void
}
define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(ptr addrspace(1) %out, ptr addrspace(3) noalias %lptr, ptr addrspace(4) %ptr0) #0 {
; CI-LABEL: reorder_smrd_load_local_store_smrd_load:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_load_dword s8, s[4:5], 0xb
; CI-NEXT:    v_mov_b32_e32 v0, 0x63
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    v_mov_b32_e32 v1, s8
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    ds_write_b32 v1, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_add_i32 s4, s4, s5
; CI-NEXT:    v_mov_b32_e32 v0, s4
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: reorder_smrd_load_local_store_smrd_load:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x63
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x4
; GFX9-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-NEXT:    ds_write_b32 v2, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s4, s5
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
; Two adjacent constant loads (offsets 1 and 2) with an LDS store between
; them. Since the ds_write cannot alias addrspace(4), the loads are merged
; into a single s_load_dwordx2 issued before the store.
  %ptr1 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 1
  %ptr2 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 2

  %tmp1 = load i32, ptr addrspace(4) %ptr1, align 4
  store i32 99, ptr addrspace(3) %lptr, align 4
  %tmp2 = load i32, ptr addrspace(4) %ptr2, align 4

  %add = add nsw i32 %tmp1, %tmp2

  store i32 %add, ptr addrspace(1) %out, align 4
  ret void
}
define amdgpu_kernel void @reorder_global_load_local_store_global_load(ptr addrspace(1) %out, ptr addrspace(3) %lptr, ptr addrspace(1) %ptr0) #0 {
; CI-LABEL: reorder_global_load_local_store_global_load:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_mov_b32 s2, s6
; CI-NEXT:    s_mov_b32 s3, s7
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4
; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:12
; CI-NEXT:    s_load_dword s0, s[4:5], 0xb
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; CI-NEXT:    v_mov_b32_e32 v2, 0x63
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v3, s0
; CI-NEXT:    ds_write_b32 v3, v2
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: reorder_global_load_local_store_global_load:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, 0x63
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] offset:4
; GFX9-NEXT:    global_load_dword v2, v0, s[0:1] offset:12
; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, s2
; GFX9-NEXT:    ds_write_b32 v4, v3
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
; Two global loads with an LDS store between them; the address spaces cannot
; alias, so the loads are clustered together ahead of the ds_write.
  %ptr1 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i64 1
  %ptr2 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i64 3

  %tmp1 = load i32, ptr addrspace(1) %ptr1, align 4
  store i32 99, ptr addrspace(3) %lptr, align 4
  %tmp2 = load i32, ptr addrspace(1) %ptr2, align 4

  %add = add nsw i32 %tmp1, %tmp2

  store i32 %add, ptr addrspace(1) %out, align 4
  ret void
}
define amdgpu_kernel void @reorder_local_offsets(ptr addrspace(1) nocapture %out, ptr addrspace(1) noalias nocapture readnone %gptr, ptr addrspace(3) noalias nocapture %ptr0) #0 {
; CI-LABEL: reorder_local_offsets:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s6, s[4:5], 0xd
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v3, s6
; CI-NEXT:    ds_read2_b32 v[0:1], v3 offset0:100 offset1:102
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    ds_write2_b32 v3, v2, v2 offset0:3 offset1:100
; CI-NEXT:    v_mov_b32_e32 v2, 0x315
; CI-NEXT:    ds_write_b32 v3, v2 offset:408
; CI-NEXT:    s_waitcnt lgkmcnt(2)
; CI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; CI-NEXT:    v_add_i32_e32 v0, vcc, 0x7b, v0
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: reorder_local_offsets:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x34
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    ds_read2_b32 v[0:1], v3 offset0:100 offset1:102
; GFX9-NEXT:    ds_write2_b32 v3, v4, v4 offset0:3 offset1:100
; GFX9-NEXT:    v_mov_b32_e32 v4, 0x315
; GFX9-NEXT:    ds_write_b32 v3, v4 offset:408
; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
; GFX9-NEXT:    v_add_u32_e32 v0, v1, v0
; GFX9-NEXT:    v_add_u32_e32 v0, 0x7b, v0
; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
; Mixed loads/stores at distinct known LDS offsets (12, 400, 408 bytes) from
; one base. The loads of %ptr2/%ptr3 are merged into a ds_read2 scheduled
; before the writes; the re-load of %ptr1 is forwarded from the store of 123
; (the 0x7b add in the checks), so no third ds_read is emitted.
  %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 3
  %ptr2 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 100
  %ptr3 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 102

  store i32 123, ptr addrspace(3) %ptr1, align 4
  %tmp1 = load i32, ptr addrspace(3) %ptr2, align 4
  %tmp2 = load i32, ptr addrspace(3) %ptr3, align 4
  store i32 123, ptr addrspace(3) %ptr2, align 4
  %tmp3 = load i32, ptr addrspace(3) %ptr1, align 4
  store i32 789, ptr addrspace(3) %ptr3, align 4

  %add.0 = add nsw i32 %tmp2, %tmp1
  %add.1 = add nsw i32 %add.0, %tmp3
  store i32 %add.1, ptr addrspace(1) %out, align 4
  ret void
}
define amdgpu_kernel void @reorder_global_offsets(ptr addrspace(1) nocapture %out, ptr addrspace(1) noalias nocapture readnone %gptr, ptr addrspace(1) noalias nocapture %ptr0) #0 {
; CI-LABEL: reorder_global_offsets:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_mov_b32 s2, s6
; CI-NEXT:    s_mov_b32 s3, s7
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:400
; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:408
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
; CI-NEXT:    v_mov_b32_e32 v3, 0x315
; CI-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:12
; CI-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:400
; CI-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:408
; CI-NEXT:    s_waitcnt vmcnt(3)
; CI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; CI-NEXT:    v_add_i32_e32 v0, vcc, 0x7b, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: reorder_global_offsets:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7b
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] offset:400
; GFX9-NEXT:    global_load_dword v2, v0, s[0:1] offset:408
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    global_store_dword v0, v3, s[0:1] offset:12
; GFX9-NEXT:    global_store_dword v0, v3, s[0:1] offset:400
; GFX9-NEXT:    v_mov_b32_e32 v3, 0x315
; GFX9-NEXT:    global_store_dword v0, v3, s[0:1] offset:408
; GFX9-NEXT:    s_waitcnt vmcnt(3)
; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
; GFX9-NEXT:    v_add_u32_e32 v1, 0x7b, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
; Same access pattern as reorder_local_offsets, but in global memory: the
; loads at byte offsets 400/408 are clustered ahead of all three stores, and
; the re-load of %ptr1 is forwarded from the stored 123 (the 0x7b add).
  %ptr1 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 3
  %ptr2 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 100
  %ptr3 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 102

  store i32 123, ptr addrspace(1) %ptr1, align 4
  %tmp1 = load i32, ptr addrspace(1) %ptr2, align 4
  %tmp2 = load i32, ptr addrspace(1) %ptr3, align 4
  store i32 123, ptr addrspace(1) %ptr2, align 4
  %tmp3 = load i32, ptr addrspace(1) %ptr1, align 4
  store i32 789, ptr addrspace(1) %ptr3, align 4

  %add.0 = add nsw i32 %tmp2, %tmp1
  %add.1 = add nsw i32 %add.0, %tmp3
  store i32 %add.1, ptr addrspace(1) %out, align 4
  ret void
}
define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(ptr addrspace(1) noalias nocapture %ptr.base) #0 {
; CI-LABEL: reorder_global_offsets_addr64_soffset0:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 offset:12
; CI-NEXT:    buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:28
; CI-NEXT:    buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 offset:44
; CI-NEXT:    v_mov_b32_e32 v5, 0x315
; CI-NEXT:    v_mov_b32_e32 v6, 0x7b
; CI-NEXT:    buffer_store_dword v5, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    buffer_store_dword v6, v[0:1], s[0:3], 0 addr64 offset:20
; CI-NEXT:    s_waitcnt vmcnt(3)
; CI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:36
; CI-NEXT:    s_waitcnt vmcnt(3)
; CI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:52
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: reorder_global_offsets_addr64_soffset0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0x315
; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7b
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] offset:12
; GFX9-NEXT:    global_load_dword v2, v0, s[0:1] offset:28
; GFX9-NEXT:    global_load_dword v3, v0, s[0:1] offset:44
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    global_store_dword v0, v4, s[0:1]
; GFX9-NEXT:    global_store_dword v0, v5, s[0:1] offset:20
; GFX9-NEXT:    s_waitcnt vmcnt(3)
; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1] offset:36
; GFX9-NEXT:    s_waitcnt vmcnt(3)
; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1] offset:52
; GFX9-NEXT:    s_endpgm
; Per-lane addressing: the base is indexed by workitem id, with accesses at
; constant i32 offsets 3/5/7/9/11/13 from it. The offsets fold into the
; addr64/global immediate-offset field (12..52 bytes), and the three
; independent loads are clustered ahead of the unrelated stores.
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %id.ext = sext i32 %id to i64

  %ptr0 = getelementptr inbounds i32, ptr addrspace(1) %ptr.base, i64 %id.ext
  %ptr1 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 3
  %ptr2 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 5
  %ptr3 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 7
  %ptr4 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 9
  %ptr5 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 11
  %ptr6 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 13

  store i32 789, ptr addrspace(1) %ptr0, align 4
  %tmp1 = load i32, ptr addrspace(1) %ptr1, align 4
  store i32 123, ptr addrspace(1) %ptr2, align 4
  %tmp2 = load i32, ptr addrspace(1) %ptr3, align 4
  %add.0 = add nsw i32 %tmp1, %tmp2
  store i32 %add.0, ptr addrspace(1) %ptr4, align 4
  %tmp3 = load i32, ptr addrspace(1) %ptr5, align 4
  %add.1 = add nsw i32 %add.0, %tmp3
  store i32 %add.1, ptr addrspace(1) %ptr6, align 4
  ret void
}
define amdgpu_vs void @reorder_local_load_tbuffer_store_local_load(ptr addrspace(1) %out, i32 %a1, i32 %vaddr) #0 {
; CI-LABEL: reorder_local_load_tbuffer_store_local_load:
; CI:       ; %bb.0:
; CI-NEXT:    v_mov_b32_e32 v4, stored_lds_ptr@abs32@lo
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_read_b32 v4, v4
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s0, s2
; CI-NEXT:    s_mov_b32 s1, s2
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    ds_read2_b32 v[4:5], v4 offset0:1 offset1:2
; CI-NEXT:    v_add_i32_e32 v3, vcc, 32, v3
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    tbuffer_store_format_xyzw v[2:5], v3, s[0:3], 0 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_SNORM_OGL] idxen glc slc
; CI-NEXT:    s_nop 0
; CI-NEXT:    v_add_i32_e32 v2, vcc, v4, v5
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: reorder_local_load_tbuffer_store_local_load:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v4, stored_lds_ptr@abs32@lo
; GFX9-NEXT:    ds_read_b32 v4, v4
; GFX9-NEXT:    v_add_u32_e32 v3, 32, v3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_read2_b32 v[4:5], v4 offset0:1 offset1:2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    tbuffer_store_format_xyzw v[2:5], v3, s[0:3], 0 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_RESERVED_6] idxen glc slc
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_add_u32_e32 v2, v4, v5
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
; The intervening store is a struct tbuffer store (buffer resource), which
; cannot alias the LDS loads, so the second local load is moved up and both
; merge into the ds_read2 issued before the tbuffer_store.
  %ptr0 = load ptr addrspace(3), ptr addrspace(3) @stored_lds_ptr, align 4

  %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 1
  %ptr2 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 2

  %tmp1 = load i32, ptr addrspace(3) %ptr1, align 4

  %vdata = insertelement <4 x i32> poison, i32 %a1, i32 0
  %vaddr.add = add i32 %vaddr, 32
  call void @llvm.amdgcn.struct.ptr.tbuffer.store.v4i32(<4 x i32> %vdata, ptr addrspace(8) poison, i32 %vaddr.add, i32 0, i32 0, i32 228, i32 3)

  %tmp2 = load i32, ptr addrspace(3) %ptr2, align 4

  %add = add nsw i32 %tmp1, %tmp2
  store i32 %add, ptr addrspace(1) %out, align 4
  ret void
}
; Intrinsic declarations and attribute groups referenced by the kernels above.
declare void @llvm.memcpy.p3.p0(ptr addrspace(3), ptr, i64, i1)
declare void @llvm.amdgcn.s.barrier() #1
declare i32 @llvm.amdgcn.workitem.id.x() #2
declare void @llvm.amdgcn.struct.ptr.tbuffer.store.v4i32(<4 x i32>, ptr addrspace(8), i32, i32, i32, i32 immarg, i32 immarg) #3

attributes #0 = { nounwind }
attributes #1 = { convergent nounwind willreturn }
attributes #2 = { nounwind readnone speculatable willreturn }
attributes #3 = { nounwind willreturn writeonly }