| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-MUBUF %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-FLATSCR %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-MUBUF %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-FLATSCR %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942 %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11 %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12 %s |
| |
| ; This test checks memory addresses with constant offset components that should |
| ; not be folded into memory accesses with immediate offsets. |
| ; SeparateConstOffsetsFromGEP transforms the GEPs in a way that can lead to |
| ; out-of-bounds or negative intermediate results in the address computation, |
| ; which are problematic for flat and scratch instructions: |
| ; gep[inbounds](p, i + 3) -> gep(gep(p, i), 3) |
| |
| |
| ; FIXME: The offset here should not be folded: if %p points to the beginning of |
| ; scratch or LDS and %i is -1, a folded offset crashes the program. |
| ; |
| ; Loads element (%i + 3) of an i32 array through a generic (flat) pointer. |
| ; The assertions below record the current, folded output: for every target the |
| ; address arithmetic only adds the sign-extended %i, scaled by 4, to %p, and |
| ; the remaining 12 bytes (3 x i32) are carried by the immediate `offset:12` |
| ; operand of the flat load. |
| define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { |
| ; GFX90A-LABEL: flat_offset_maybe_oob: |
| ; GFX90A: ; %bb.0: |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2 |
| ; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] |
| ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc |
| ; GFX90A-NEXT: flat_load_dword v0, v[0:1] offset:12 |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX90A-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: flat_offset_maybe_oob: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 |
| ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo |
| ; GFX10-NEXT: flat_load_dword v0, v[0:1] offset:12 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX942-LABEL: flat_offset_maybe_oob: |
| ; GFX942: ; %bb.0: |
| ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v2 |
| ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] |
| ; GFX942-NEXT: flat_load_dword v0, v[0:1] offset:12 |
| ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX942-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: flat_offset_maybe_oob: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo |
| ; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:12 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX12-LABEL: flat_offset_maybe_oob: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-NEXT: s_wait_expcnt 0x0 |
| ; GFX12-NEXT: s_wait_samplecnt 0x0 |
| ; GFX12-NEXT: s_wait_bvhcnt 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] |
| ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 |
| ; GFX12-NEXT: s_wait_alu 0xfffd |
| ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo |
| ; GFX12-NEXT: flat_load_b32 v0, v[0:1] offset:12 |
| ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-NEXT: s_wait_alu 0xfffd |
| ; GFX12-NEXT: s_setpc_b64 s[30:31] |
| ; Variable index plus a constant: the `i + 3` half of the |
| ; gep[inbounds](p, i + 3) shape that this test is about. |
| %idx = add nsw i32 %i, 3 |
| ; Only the final address p + 4 * (i + 3) is marked inbounds here; nothing in |
| ; this IR says that the intermediate address p + 4 * i is in bounds as well. |
| %arrayidx = getelementptr inbounds i32, ptr %p, i32 %idx |
| ; The load through the generic pointer is what becomes the flat load above. |
| %l = load i32, ptr %arrayidx |
| ret i32 %l |
| } |
| |
| ; For MUBUF and for GFX12, folding the offset is okay. |
| ; |
| ; Same IR shape as @flat_offset_maybe_oob, but the access goes through a |
| ; `ptr addrspace(5)` pointer. The assertions below show two outcomes: |
| ;  - folded: the MUBUF runs emit `buffer_load_dword ... offen offset:12` and |
| ;    the GFX12 run emits `scratch_load_b32 ... off offset:12`; |
| ;  - not folded: the flat-scratch runs for GFX90A and GFX10, as well as the |
| ;    GFX942 and GFX11 runs, add the constant 12 to the address with |
| ;    v_add3_u32 and issue the scratch load without an immediate offset. |
| define i32 @private_offset_maybe_oob(ptr addrspace(5) %p, i32 %i) { |
| ; GFX90A-MUBUF-LABEL: private_offset_maybe_oob: |
| ; GFX90A-MUBUF: ; %bb.0: |
| ; GFX90A-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX90A-MUBUF-NEXT: v_lshl_add_u32 v0, v1, 2, v0 |
| ; GFX90A-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12 |
| ; GFX90A-MUBUF-NEXT: s_waitcnt vmcnt(0) |
| ; GFX90A-MUBUF-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX90A-FLATSCR-LABEL: private_offset_maybe_oob: |
| ; GFX90A-FLATSCR: ; %bb.0: |
| ; GFX90A-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX90A-FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 2, v1 |
| ; GFX90A-FLATSCR-NEXT: v_add3_u32 v0, v0, v1, 12 |
| ; GFX90A-FLATSCR-NEXT: scratch_load_dword v0, v0, off |
| ; GFX90A-FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; GFX90A-FLATSCR-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-MUBUF-LABEL: private_offset_maybe_oob: |
| ; GFX10-MUBUF: ; %bb.0: |
| ; GFX10-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-MUBUF-NEXT: v_lshl_add_u32 v0, v1, 2, v0 |
| ; GFX10-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12 |
| ; GFX10-MUBUF-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-MUBUF-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-FLATSCR-LABEL: private_offset_maybe_oob: |
| ; GFX10-FLATSCR: ; %bb.0: |
| ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 2, v1 |
| ; GFX10-FLATSCR-NEXT: v_add3_u32 v0, v0, v1, 12 |
| ; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v0, off |
| ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-FLATSCR-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX942-LABEL: private_offset_maybe_oob: |
| ; GFX942: ; %bb.0: |
| ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v1 |
| ; GFX942-NEXT: v_add3_u32 v0, v0, v1, 12 |
| ; GFX942-NEXT: scratch_load_dword v0, v0, off |
| ; GFX942-NEXT: s_waitcnt vmcnt(0) |
| ; GFX942-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: private_offset_maybe_oob: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add3_u32 v0, v0, v1, 12 |
| ; GFX11-NEXT: scratch_load_b32 v0, v0, off |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX12-LABEL: private_offset_maybe_oob: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-NEXT: s_wait_expcnt 0x0 |
| ; GFX12-NEXT: s_wait_samplecnt 0x0 |
| ; GFX12-NEXT: s_wait_bvhcnt 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_lshl_add_u32 v0, v1, 2, v0 |
| ; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:12 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: s_setpc_b64 s[30:31] |
| ; Variable index plus a constant: the `i + 3` half of the |
| ; gep[inbounds](p, i + 3) shape that this test is about. |
| %idx = add nsw i32 %i, 3 |
| ; Only the final address p + 4 * (i + 3) is marked inbounds here; nothing in |
| ; this IR says that the intermediate address p + 4 * i is in bounds as well. |
| %arrayidx = getelementptr inbounds i32, ptr addrspace(5) %p, i32 %idx |
| ; The addrspace(5) load is what becomes the buffer/scratch load above. |
| %l = load i32, ptr addrspace(5) %arrayidx |
| ret i32 %l |
| } |