| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX9 %s |
| ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX10 %s |
| ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX9 %s |
| ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX10 %s |
| |
| ; Test case looks at the allocated offset of @used_by_both. It's at zero when |
| ; allocated by itself, but at 8 when allocated in combination with the double. |
| ; Redundantly also checks LDSByteSize. |
| ; i32 in LDS (addrspace 3); stored to by @withcall, @nocall_false_sharing and @nonkernel. |
| @used_by_both = addrspace(3) global i32 undef |
| ; i32 in LDS (addrspace 3); stored to only by the kernel @nocall_ideal. |
| @used_by_kernel = addrspace(3) global i32 undef |
| ; double in LDS (addrspace 3); stored to only by the non-kernel function @nonkernel. |
| @used_by_function = addrspace(3) global double undef |
| |
| ; A kernel that calls no functions and uses an LDS variable allocates only that |
| ; variable, so the access is at offset 0 and LDSByteSize is 4. |
| define amdgpu_kernel void @nocall_ideal() { |
| ; CHECK-LABEL: nocall_ideal: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0 |
| ; CHECK-NEXT: ds_write_b32 v0, v0 |
| ; CHECK-NEXT: s_endpgm |
| ; The only LDS access in this kernel, and no function is called. The expected |
| ; ds_write_b32 above uses address register v0 = 0 with no offset operand. |
| store i32 0, ptr addrspace(3) @used_by_kernel |
| ret void |
| } |
| ; CHECK: ; LDSByteSize: 4 bytes |
| |
| ; Needs to allocate both variables; the store to used_by_both is at offset sizeof(double). |
| define amdgpu_kernel void @withcall() { |
| ; GFX9-LABEL: withcall: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 |
| ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 |
| ; GFX9-NEXT: s_mov_b32 s10, -1 |
| ; GFX9-NEXT: s_mov_b32 s11, 0xe00000 |
| ; GFX9-NEXT: s_add_u32 s8, s8, s3 |
| ; GFX9-NEXT: s_addc_u32 s9, s9, 0 |
| ; GFX9-NEXT: s_getpc_b64 s[2:3] |
| ; GFX9-NEXT: s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 |
| ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] |
| ; GFX9-NEXT: s_mov_b64 s[0:1], s[8:9] |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: s_mov_b64 s[2:3], s[10:11] |
| ; GFX9-NEXT: s_mov_b32 s32, 0 |
| ; GFX9-NEXT: ds_write_b32 v0, v0 offset:8 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: withcall: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 |
| ; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 |
| ; GFX10-NEXT: s_mov_b32 s10, -1 |
| ; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 |
| ; GFX10-NEXT: s_add_u32 s8, s8, s3 |
| ; GFX10-NEXT: s_addc_u32 s9, s9, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[2:3] |
| ; GFX10-NEXT: s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 |
| ; GFX10-NEXT: s_mov_b64 s[6:7], s[0:1] |
| ; GFX10-NEXT: s_mov_b64 s[0:1], s[8:9] |
| ; GFX10-NEXT: s_mov_b64 s[2:3], s[10:11] |
| ; GFX10-NEXT: s_mov_b32 s32, 0 |
| ; GFX10-NEXT: ds_write_b32 v0, v0 offset:8 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; G_GFX9-LABEL: withcall: |
| ; G_GFX9: ; %bb.0: |
| ; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 |
| ; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 |
| ; G_GFX9-NEXT: s_mov_b32 s10, -1 |
| ; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000 |
| ; G_GFX9-NEXT: s_add_u32 s8, s8, s3 |
| ; G_GFX9-NEXT: s_addc_u32 s9, s9, 0 |
| ; G_GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] |
| ; G_GFX9-NEXT: s_getpc_b64 s[0:1] |
| ; G_GFX9-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4 |
| ; G_GFX9-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12 |
| ; G_GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| ; G_GFX9-NEXT: s_mov_b64 s[0:1], s[8:9] |
| ; G_GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; G_GFX9-NEXT: v_mov_b32_e32 v1, 8 |
| ; G_GFX9-NEXT: s_mov_b64 s[2:3], s[10:11] |
| ; G_GFX9-NEXT: s_mov_b32 s32, 0 |
| ; G_GFX9-NEXT: ds_write_b32 v1, v0 |
| ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; G_GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; G_GFX9-NEXT: s_endpgm |
| ; |
| ; G_GFX10-LABEL: withcall: |
| ; G_GFX10: ; %bb.0: |
| ; G_GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 |
| ; G_GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 |
| ; G_GFX10-NEXT: s_mov_b32 s10, -1 |
| ; G_GFX10-NEXT: s_mov_b32 s11, 0x31c16000 |
| ; G_GFX10-NEXT: s_add_u32 s8, s8, s3 |
| ; G_GFX10-NEXT: s_addc_u32 s9, s9, 0 |
| ; G_GFX10-NEXT: s_mov_b64 s[6:7], s[0:1] |
| ; G_GFX10-NEXT: s_getpc_b64 s[0:1] |
| ; G_GFX10-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4 |
| ; G_GFX10-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12 |
| ; G_GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; G_GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| ; G_GFX10-NEXT: v_mov_b32_e32 v1, 8 |
| ; G_GFX10-NEXT: s_mov_b64 s[0:1], s[8:9] |
| ; G_GFX10-NEXT: s_mov_b64 s[2:3], s[10:11] |
| ; G_GFX10-NEXT: s_mov_b32 s32, 0 |
| ; G_GFX10-NEXT: ds_write_b32 v1, v0 |
| ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; G_GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; G_GFX10-NEXT: s_endpgm |
| ; This store is the access under test. All four expectations above write it at |
| ; byte 8: the SelectionDAG runs use "offset:8" on a zero address register, the |
| ; -global-isel runs put 8 in v1 and use that as the address. |
| store i32 0, ptr addrspace(3) @used_by_both |
| ; @nonkernel stores to @used_by_both and to the double @used_by_function. In the |
| ; expected code its address is loaded via a gotpcrel32 relocation and called |
| ; with s_swappc_b64. |
| call void @nonkernel() |
| ret void |
| } |
| ; CHECK: ; LDSByteSize: 16 bytes |
| |
| ; Previous lowering was less efficient here than necessary as the i32 used |
| ; by the kernel is also used by an unrelated non-kernel function. Codegen |
| ; is now the same as nocall_ideal. |
| define amdgpu_kernel void @nocall_false_sharing() { |
| ; CHECK-LABEL: nocall_false_sharing: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0 |
| ; CHECK-NEXT: ds_write_b32 v0, v0 |
| ; CHECK-NEXT: s_endpgm |
| ; Same variable that @withcall and @nonkernel store to, but this kernel calls |
| ; nothing. The expected ds_write_b32 above has no offset operand, unlike the |
| ; offset-8 write expected in @withcall. |
| store i32 0, ptr addrspace(3) @used_by_both |
| ret void |
| } |
| ; CHECK: ; LDSByteSize: 4 bytes |
| |
| |
| ; Non-kernel function called from @withcall. It stores to @used_by_both and to |
| ; the double @used_by_function. |
| define void @nonkernel() { |
| ; GFX9-LABEL: nonkernel: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, v0 |
| ; GFX9-NEXT: ds_write_b32 v0, v0 offset:8 |
| ; GFX9-NEXT: ds_write_b64 v0, v[0:1] |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: nonkernel: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, v0 |
| ; GFX10-NEXT: ds_write_b32 v0, v0 offset:8 |
| ; GFX10-NEXT: ds_write_b64 v0, v[0:1] |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; G_GFX9-LABEL: nonkernel: |
| ; G_GFX9: ; %bb.0: |
| ; G_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; G_GFX9-NEXT: v_mov_b32_e32 v2, 0 |
| ; G_GFX9-NEXT: v_mov_b32_e32 v3, 8 |
| ; G_GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; G_GFX9-NEXT: v_mov_b32_e32 v1, 0 |
| ; G_GFX9-NEXT: ds_write_b32 v3, v2 |
| ; G_GFX9-NEXT: ds_write_b64 v2, v[0:1] |
| ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; G_GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; G_GFX10-LABEL: nonkernel: |
| ; G_GFX10: ; %bb.0: |
| ; G_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; G_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; G_GFX10-NEXT: v_mov_b32_e32 v2, 0 |
| ; G_GFX10-NEXT: v_mov_b32_e32 v3, 8 |
| ; G_GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; G_GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; G_GFX10-NEXT: ds_write_b32 v3, v2 |
| ; G_GFX10-NEXT: ds_write_b64 v2, v[0:1] |
| ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; G_GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; Expected as a 32-bit DS write at byte 8 ("offset:8", or address 8 held in v3 |
| ; for the -global-isel runs), the same offset @withcall uses for this variable. |
| store i32 0, ptr addrspace(3) @used_by_both |
| ; Expected as a 64-bit DS write through an address register holding 0. |
| store double 0.0, ptr addrspace(3) @used_by_function |
| ret void |
| } |