| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-SDAG,GFX90A-MUBUF,GFX90A-SDAG-MUBUF %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-SDAG,GFX90A-FLATSCR,GFX90A-SDAG-FLATSCR %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-SDAG,GFX10-MUBUF,GFX10-SDAG-MUBUF %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-SDAG,GFX10-FLATSCR,GFX10-SDAG-FLATSCR %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-SDAG %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-SDAG %s |
| |
| ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-GISEL,GFX90A-MUBUF,GFX90A-GISEL-MUBUF %s |
| ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-GISEL,GFX90A-FLATSCR,GFX90A-GISEL-FLATSCR %s |
| ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-GISEL,GFX10-MUBUF,GFX10-GISEL-MUBUF %s |
| ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-GISEL,GFX10-FLATSCR,GFX10-GISEL-FLATSCR %s |
| ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-GISEL %s |
| ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL %s |
| ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-GISEL %s |
| |
| ; This test checks memory addresses with constant offset components that should |
| ; not be folded into memory accesses with immediate offsets. |
| ; SeparateConstOffsetFromGEP transforms the GEPs in a way that can lead to |
| ; out-of-bounds or negative intermediate results in the address computation, |
| ; which are problematic for flat and scratch instructions: |
| ; gep[inbounds](p, i + 3) -> gep(gep(p, i), 3) |
| |
| |
| ; The offset here cannot be folded: if %p points to the beginning of scratch or |
| ; LDS and %i is -1, a folded offset crashes the program. |
| define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { |
| ; GFX90A-LABEL: flat_offset_maybe_oob: |
| ; GFX90A: ; %bb.0: |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2 |
| ; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] |
| ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc |
| ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 12, v0 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX90A-NEXT: flat_load_dword v0, v[0:1] |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX90A-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: flat_offset_maybe_oob: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 |
| ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo |
| ; GFX10-NEXT: flat_load_dword v0, v[0:1] |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX942-SDAG-LABEL: flat_offset_maybe_oob: |
| ; GFX942-SDAG: ; %bb.0: |
| ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX942-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 |
| ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] |
| ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 12 |
| ; GFX942-SDAG-NEXT: flat_load_dword v0, v[0:1] |
| ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: flat_offset_maybe_oob: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo |
| ; GFX11-NEXT: flat_load_b32 v0, v[0:1] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX12-LABEL: flat_offset_maybe_oob: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-NEXT: s_wait_expcnt 0x0 |
| ; GFX12-NEXT: s_wait_samplecnt 0x0 |
| ; GFX12-NEXT: s_wait_bvhcnt 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] |
| ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 |
| ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo |
| ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 |
| ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo |
| ; GFX12-NEXT: flat_load_b32 v0, v[0:1] |
| ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX942-GISEL-LABEL: flat_offset_maybe_oob: |
| ; GFX942-GISEL: ; %bb.0: |
| ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX942-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 |
| ; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] |
| ; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 12, v0 |
| ; GFX942-GISEL-NEXT: s_nop 1 |
| ; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX942-GISEL-NEXT: flat_load_dword v0, v[0:1] |
| ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] |
| ; Input IR: the constant 3 is added to the index *before* the single inbounds |
| ; GEP, so only the final address %p + 4*(%i + 3) is known to be in bounds. |
| ; Every check prefix above expects the +12 to be done with explicit adds and |
| ; the flat load to carry no immediate offset. |
| %idx = add nsw i32 %i, 3 |
| %arrayidx = getelementptr inbounds i32, ptr %p, i32 %idx |
| %l = load i32, ptr %arrayidx |
| ret i32 %l |
| } |
| |
| ; For MUBUF and for GFX12, folding the offset is okay. |
| define i32 @private_offset_maybe_oob(ptr addrspace(5) %p, i32 %i) { |
| ; GFX90A-SDAG-MUBUF-LABEL: private_offset_maybe_oob: |
| ; GFX90A-SDAG-MUBUF: ; %bb.0: |
| ; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX90A-SDAG-MUBUF-NEXT: v_lshl_add_u32 v0, v1, 2, v0 |
| ; GFX90A-SDAG-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12 |
| ; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) |
| ; GFX90A-SDAG-MUBUF-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX90A-FLATSCR-LABEL: private_offset_maybe_oob: |
| ; GFX90A-FLATSCR: ; %bb.0: |
| ; GFX90A-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX90A-FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 2, v1 |
| ; GFX90A-FLATSCR-NEXT: v_add3_u32 v0, v0, v1, 12 |
| ; GFX90A-FLATSCR-NEXT: scratch_load_dword v0, v0, off |
| ; GFX90A-FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; GFX90A-FLATSCR-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-SDAG-MUBUF-LABEL: private_offset_maybe_oob: |
| ; GFX10-SDAG-MUBUF: ; %bb.0: |
| ; GFX10-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-SDAG-MUBUF-NEXT: v_lshl_add_u32 v0, v1, 2, v0 |
| ; GFX10-SDAG-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12 |
| ; GFX10-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-SDAG-MUBUF-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-FLATSCR-LABEL: private_offset_maybe_oob: |
| ; GFX10-FLATSCR: ; %bb.0: |
| ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 2, v1 |
| ; GFX10-FLATSCR-NEXT: v_add3_u32 v0, v0, v1, 12 |
| ; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v0, off |
| ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-FLATSCR-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX942-LABEL: private_offset_maybe_oob: |
| ; GFX942: ; %bb.0: |
| ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v1 |
| ; GFX942-NEXT: v_add3_u32 v0, v0, v1, 12 |
| ; GFX942-NEXT: scratch_load_dword v0, v0, off |
| ; GFX942-NEXT: s_waitcnt vmcnt(0) |
| ; GFX942-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: private_offset_maybe_oob: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add3_u32 v0, v0, v1, 12 |
| ; GFX11-NEXT: scratch_load_b32 v0, v0, off |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX12-SDAG-LABEL: private_offset_maybe_oob: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, v0 |
| ; GFX12-SDAG-NEXT: scratch_load_b32 v0, v0, off offset:12 |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX90A-GISEL-MUBUF-LABEL: private_offset_maybe_oob: |
| ; GFX90A-GISEL-MUBUF: ; %bb.0: |
| ; GFX90A-GISEL-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX90A-GISEL-MUBUF-NEXT: v_lshlrev_b32_e32 v1, 2, v1 |
| ; GFX90A-GISEL-MUBUF-NEXT: v_add_u32_e32 v0, v0, v1 |
| ; GFX90A-GISEL-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12 |
| ; GFX90A-GISEL-MUBUF-NEXT: s_waitcnt vmcnt(0) |
| ; GFX90A-GISEL-MUBUF-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-GISEL-MUBUF-LABEL: private_offset_maybe_oob: |
| ; GFX10-GISEL-MUBUF: ; %bb.0: |
| ; GFX10-GISEL-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-GISEL-MUBUF-NEXT: v_lshlrev_b32_e32 v1, 2, v1 |
| ; GFX10-GISEL-MUBUF-NEXT: v_add_nc_u32_e32 v0, v0, v1 |
| ; GFX10-GISEL-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12 |
| ; GFX10-GISEL-MUBUF-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-GISEL-MUBUF-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX12-GISEL-LABEL: private_offset_maybe_oob: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, v0, v1 |
| ; GFX12-GISEL-NEXT: scratch_load_b32 v0, v0, off offset:12 |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] |
| ; Input IR: same index computation as @flat_offset_maybe_oob, but through an |
| ; addrspace(5) pointer. The MUBUF and GFX12 prefixes above expect offset:12 on |
| ; the load; the FLATSCR, GFX942 and GFX11 prefixes expect the 12 to be added |
| ; with v_add3_u32 and the scratch load to have no immediate offset. |
| %idx = add nsw i32 %i, 3 |
| %arrayidx = getelementptr inbounds i32, ptr addrspace(5) %p, i32 %idx |
| %l = load i32, ptr addrspace(5) %arrayidx |
| ret i32 %l |
| } |
| |
| ; If the GEP that adds the offset is inbounds, folding the offset is legal. |
| define i32 @flat_offset_inbounds(ptr %p, i32 %i) { |
| ; GFX90A-LABEL: flat_offset_inbounds: |
| ; GFX90A: ; %bb.0: |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2 |
| ; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] |
| ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc |
| ; GFX90A-NEXT: flat_load_dword v0, v[0:1] offset:12 |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX90A-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: flat_offset_inbounds: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 |
| ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo |
| ; GFX10-NEXT: flat_load_dword v0, v[0:1] offset:12 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX942-LABEL: flat_offset_inbounds: |
| ; GFX942: ; %bb.0: |
| ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v2 |
| ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] |
| ; GFX942-NEXT: flat_load_dword v0, v[0:1] offset:12 |
| ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX942-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: flat_offset_inbounds: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo |
| ; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:12 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX12-LABEL: flat_offset_inbounds: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-NEXT: s_wait_expcnt 0x0 |
| ; GFX12-NEXT: s_wait_samplecnt 0x0 |
| ; GFX12-NEXT: s_wait_bvhcnt 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] |
| ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 |
| ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo |
| ; GFX12-NEXT: flat_load_b32 v0, v[0:1] offset:12 |
| ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-NEXT: s_setpc_b64 s[30:31] |
| ; Input IR: the variable index and the constant 3 are applied by two separate |
| ; inbounds GEPs, so the intermediate pointer %p.1 comes from an inbounds GEP |
| ; as well. Every check prefix above expects the 12-byte offset to be folded |
| ; into the flat load as offset:12. |
| %p.1 = getelementptr inbounds i32, ptr %p, i32 %i |
| %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3 |
| %l = load i32, ptr %arrayidx |
| ret i32 %l |
| } |
| |
| define void @flat_offset_inbounds_wide(ptr %p, ptr %pout, i32 %i) { |
| ; GFX90A-SDAG-LABEL: flat_offset_inbounds_wide: |
| ; GFX90A-SDAG: ; %bb.0: |
| ; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX90A-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX90A-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] |
| ; GFX90A-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 |
| ; GFX90A-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc |
| ; GFX90A-SDAG-NEXT: v_add_co_u32_e32 v4, vcc, 28, v0 |
| ; GFX90A-SDAG-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc |
| ; GFX90A-SDAG-NEXT: flat_load_dword v10, v[4:5] |
| ; GFX90A-SDAG-NEXT: flat_load_dwordx4 v[6:9], v[0:1] offset:12 |
| ; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX90A-SDAG-NEXT: flat_store_dword v[2:3], v10 offset:16 |
| ; GFX90A-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[6:9] |
| ; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX90A-SDAG-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-SDAG-LABEL: flat_offset_inbounds_wide: |
| ; GFX10-SDAG: ; %bb.0: |
| ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX10-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] |
| ; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 |
| ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo |
| ; GFX10-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 28 |
| ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo |
| ; GFX10-SDAG-NEXT: s_clause 0x1 |
| ; GFX10-SDAG-NEXT: flat_load_dword v8, v[4:5] |
| ; GFX10-SDAG-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 |
| ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) |
| ; GFX10-SDAG-NEXT: flat_store_dword v[2:3], v8 offset:16 |
| ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) |
| ; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[4:7] |
| ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX942-SDAG-LABEL: flat_offset_inbounds_wide: |
| ; GFX942-SDAG: ; %bb.0: |
| ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX942-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] |
| ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 28 |
| ; GFX942-SDAG-NEXT: flat_load_dword v10, v[4:5] |
| ; GFX942-SDAG-NEXT: flat_load_dwordx4 v[6:9], v[0:1] offset:12 |
| ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX942-SDAG-NEXT: flat_store_dword v[2:3], v10 offset:16 |
| ; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[6:9] |
| ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-SDAG-LABEL: flat_offset_inbounds_wide: |
| ; GFX11-SDAG: ; %bb.0: |
| ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] |
| ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 |
| ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo |
| ; GFX11-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 28 |
| ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo |
| ; GFX11-SDAG-NEXT: s_clause 0x1 |
| ; GFX11-SDAG-NEXT: flat_load_b32 v8, v[4:5] |
| ; GFX11-SDAG-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 |
| ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) |
| ; GFX11-SDAG-NEXT: flat_store_b32 v[2:3], v8 offset:16 |
| ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) |
| ; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[4:7] |
| ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX12-SDAG-LABEL: flat_offset_inbounds_wide: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] |
| ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 |
| ; GFX12-SDAG-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo |
| ; GFX12-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 28 |
| ; GFX12-SDAG-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo |
| ; GFX12-SDAG-NEXT: s_clause 0x1 |
| ; GFX12-SDAG-NEXT: flat_load_b32 v8, v[4:5] |
| ; GFX12-SDAG-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x101 |
| ; GFX12-SDAG-NEXT: flat_store_b32 v[2:3], v8 offset:16 |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x1 |
| ; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[4:7] |
| ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX90A-GISEL-LABEL: flat_offset_inbounds_wide: |
| ; GFX90A-GISEL: ; %bb.0: |
| ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX90A-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX90A-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] |
| ; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 |
| ; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc |
| ; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 |
| ; GFX90A-GISEL-NEXT: flat_load_dword v8, v[0:1] offset:28 |
| ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] |
| ; GFX90A-GISEL-NEXT: flat_store_dword v[2:3], v8 offset:16 |
| ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-GISEL-LABEL: flat_offset_inbounds_wide: |
| ; GFX10-GISEL: ; %bb.0: |
| ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX10-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] |
| ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 |
| ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo |
| ; GFX10-GISEL-NEXT: s_clause 0x1 |
| ; GFX10-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 |
| ; GFX10-GISEL-NEXT: flat_load_dword v0, v[0:1] offset:28 |
| ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) |
| ; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] |
| ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) |
| ; GFX10-GISEL-NEXT: flat_store_dword v[2:3], v0 offset:16 |
| ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX942-GISEL-LABEL: flat_offset_inbounds_wide: |
| ; GFX942-GISEL: ; %bb.0: |
| ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX942-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] |
| ; GFX942-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 |
| ; GFX942-GISEL-NEXT: flat_load_dword v8, v[0:1] offset:28 |
| ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] |
| ; GFX942-GISEL-NEXT: flat_store_dword v[2:3], v8 offset:16 |
| ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-GISEL-LABEL: flat_offset_inbounds_wide: |
| ; GFX11-GISEL: ; %bb.0: |
| ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] |
| ; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo |
| ; GFX11-GISEL-NEXT: s_clause 0x1 |
| ; GFX11-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 |
| ; GFX11-GISEL-NEXT: flat_load_b32 v0, v[0:1] offset:28 |
| ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) |
| ; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7] |
| ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) |
| ; GFX11-GISEL-NEXT: flat_store_b32 v[2:3], v0 offset:16 |
| ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX12-GISEL-LABEL: flat_offset_inbounds_wide: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] |
| ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 |
| ; GFX12-GISEL-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo |
| ; GFX12-GISEL-NEXT: s_clause 0x1 |
| ; GFX12-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 |
| ; GFX12-GISEL-NEXT: flat_load_b32 v0, v[0:1] offset:28 |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x101 |
| ; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7] |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x1 |
| ; GFX12-GISEL-NEXT: flat_store_b32 v[2:3], v0 offset:16 |
| ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] |
| ; Input IR: same two-step inbounds address computation as |
| ; @flat_offset_inbounds, but the access is a <5 x i32> load whose value is |
| ; stored to %pout. The checks above show it split into a 16-byte and a 4-byte |
| ; flat access. The SDAG prefixes expect only the 16-byte load to use |
| ; offset:12, with the address of the 4-byte load (+28) computed by explicit |
| ; adds; the GISEL prefixes expect both loads to use immediate offsets |
| ; (offset:12 and offset:28). |
| %p.1 = getelementptr inbounds i32, ptr %p, i32 %i |
| %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3 |
| %l = load <5 x i32>, ptr %arrayidx |
| store <5 x i32> %l, ptr %pout |
| ret void |
| } |
| |
| define void @flat_offset_inbounds_very_wide(ptr %p, ptr %pout, i32 %i) { |
| ; GFX90A-SDAG-MUBUF-LABEL: flat_offset_inbounds_very_wide: |
| ; GFX90A-SDAG-MUBUF: ; %bb.0: |
| ; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX90A-SDAG-MUBUF-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX90A-SDAG-MUBUF-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] |
| ; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 |
| ; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc |
| ; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e64 v6, s[4:5], 28, v0 |
| ; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e64 v7, s[4:5], 0, v1, s[4:5] |
| ; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e64 v8, s[4:5], 44, v0 |
| ; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, 0x8c, v0 |
| ; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v1, s[4:5] |
| ; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[10:13], v[8:9] offset:16 |
| ; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[14:17], v[8:9] offset:32 |
| ; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[18:21], v[8:9] offset:48 |
| ; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[22:25], v[8:9] offset:64 |
| ; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[26:29], v[8:9] offset:80 |
| ; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[30:33], v[6:7] |
| ; GFX90A-SDAG-MUBUF-NEXT: s_nop 0 |
| ; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[6:9], v[8:9] |
| ; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc |
| ; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[34:37], v[0:1] offset:12 |
| ; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[48:51], v[4:5] |
| ; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, 48, v2 |
| ; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc |
| ; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, 0x88, v2 |
| ; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc |
| ; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:48 |
| ; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:64 |
| ; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[14:17] offset:64 |
| ; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:32 |
| ; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[6:9] offset:32 |
| ; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[10:13] |
| ; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[30:33] offset:16 |
| ; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[34:37] |
| ; GFX90A-SDAG-MUBUF-NEXT: flat_store_dword v[4:5], v50 |
| ; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx2 v[2:3], v[48:49] offset:128 |
| ; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX90A-SDAG-MUBUF-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX90A-SDAG-FLATSCR-LABEL: flat_offset_inbounds_very_wide: |
| ; GFX90A-SDAG-FLATSCR: ; %bb.0: |
| ; GFX90A-SDAG-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX90A-SDAG-FLATSCR-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX90A-SDAG-FLATSCR-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] |
| ; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 |
| ; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc |
| ; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e64 v6, s[0:1], 28, v0 |
| ; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] |
| ; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e64 v8, s[0:1], 44, v0 |
| ; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, 0x8c, v0 |
| ; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1] |
| ; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[10:13], v[8:9] offset:16 |
| ; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[14:17], v[8:9] offset:32 |
| ; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[18:21], v[8:9] offset:48 |
| ; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[22:25], v[8:9] offset:64 |
| ; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[26:29], v[8:9] offset:80 |
| ; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[30:33], v[6:7] |
| ; GFX90A-SDAG-FLATSCR-NEXT: s_nop 0 |
| ; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[6:9], v[8:9] |
| ; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc |
| ; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[34:37], v[0:1] offset:12 |
| ; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[48:51], v[4:5] |
| ; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 48, v2 |
| ; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc |
| ; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, 0x88, v2 |
| ; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc |
| ; GFX90A-SDAG-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:48 |
| ; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:64 |
| ; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[14:17] offset:64 |
| ; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:32 |
| ; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[6:9] offset:32 |
| ; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[10:13] |
| ; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[30:33] offset:16 |
| ; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[34:37] |
| ; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dword v[4:5], v50 |
| ; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx2 v[2:3], v[48:49] offset:128 |
| ; GFX90A-SDAG-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX90A-SDAG-FLATSCR-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-SDAG-LABEL: flat_offset_inbounds_very_wide: |
| ; GFX10-SDAG: ; %bb.0: |
| ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX10-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] |
| ; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 |
| ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo |
| ; GFX10-SDAG-NEXT: v_add_co_u32 v36, vcc_lo, v0, 28 |
| ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v37, null, 0, v1, vcc_lo |
| ; GFX10-SDAG-NEXT: s_clause 0x8 |
| ; GFX10-SDAG-NEXT: flat_load_dwordx4 v[4:7], v[36:37] offset:80 |
| ; GFX10-SDAG-NEXT: flat_load_dwordx4 v[8:11], v[36:37] offset:96 |
| ; GFX10-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[36:37] offset:48 |
| ; GFX10-SDAG-NEXT: flat_load_dwordx4 v[16:19], v[36:37] offset:64 |
| ; GFX10-SDAG-NEXT: flat_load_dwordx4 v[20:23], v[36:37] offset:16 |
| ; GFX10-SDAG-NEXT: flat_load_dwordx4 v[24:27], v[36:37] offset:32 |
| ; GFX10-SDAG-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:12 |
| ; GFX10-SDAG-NEXT: flat_load_dwordx4 v[32:35], v[36:37] |
| ; GFX10-SDAG-NEXT: flat_load_dwordx4 v[36:39], v[36:37] offset:112 |
| ; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v2, 48 |
| ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo |
| ; GFX10-SDAG-NEXT: v_add_co_u32 v48, vcc_lo, 0x88, v2 |
| ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v49, null, 0, v3, vcc_lo |
| ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) |
| ; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:48 |
| ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8) |
| ; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:64 |
| ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8) |
| ; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:64 |
| ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8) |
| ; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:32 |
| ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8) |
| ; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:32 |
| ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8) |
| ; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[24:27] |
| ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8) |
| ; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[28:31] |
| ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8) |
| ; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:16 |
| ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) |
| ; GFX10-SDAG-NEXT: flat_store_dword v[48:49], v38 |
| ; GFX10-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[36:37] offset:128 |
| ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX942-SDAG-LABEL: flat_offset_inbounds_very_wide: |
| ; GFX942-SDAG: ; %bb.0: |
| ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX942-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] |
| ; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x5c |
| ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[10:11], v[0:1], 0, s[2:3] |
| ; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x4c |
| ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[12:13], v[0:1], 0, s[2:3] |
| ; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x7c |
| ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[14:15], v[0:1], 0, s[2:3] |
| ; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x6c |
| ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[16:17], v[0:1], 0, s[2:3] |
| ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 28 |
| ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, 60 |
| ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[8:9], v[0:1], 0, 44 |
| ; GFX942-SDAG-NEXT: flat_load_dwordx4 v[18:21], v[16:17] |
| ; GFX942-SDAG-NEXT: flat_load_dwordx4 v[22:25], v[12:13] |
| ; GFX942-SDAG-NEXT: flat_load_dwordx4 v[26:29], v[14:15] |
| ; GFX942-SDAG-NEXT: ; kill: killed $vgpr12_vgpr13 |
| ; GFX942-SDAG-NEXT: ; kill: killed $vgpr14_vgpr15 |
| ; GFX942-SDAG-NEXT: ; kill: killed $vgpr16_vgpr17 |
| ; GFX942-SDAG-NEXT: s_nop 0 |
| ; GFX942-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[8:9] |
| ; GFX942-SDAG-NEXT: flat_load_dwordx4 v[30:33], v[10:11] |
| ; GFX942-SDAG-NEXT: flat_load_dwordx4 v[34:37], v[4:5] |
| ; GFX942-SDAG-NEXT: flat_load_dwordx4 v[48:51], v[6:7] |
| ; GFX942-SDAG-NEXT: flat_load_dwordx4 v[52:55], v[0:1] offset:12 |
| ; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0x8c |
| ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] |
| ; GFX942-SDAG-NEXT: flat_load_dwordx4 a[0:3], v[0:1] |
| ; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0x60 |
| ; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x70 |
| ; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0x50 |
| ; GFX942-SDAG-NEXT: s_mov_b64 s[6:7], 0x88 |
| ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, s[0:1] |
| ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, s[2:3] |
| ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[6:7], v[2:3], 0, s[4:5] |
| ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[8:9], v[2:3], 0, 48 |
| ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[10:11], v[2:3], 0, s[6:7] |
| ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX942-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[18:21] |
| ; GFX942-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[26:29] |
| ; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[22:25] offset:64 |
| ; GFX942-SDAG-NEXT: flat_store_dwordx4 v[6:7], v[30:33] |
| ; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32 |
| ; GFX942-SDAG-NEXT: flat_store_dwordx4 v[8:9], v[48:51] |
| ; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[52:55] |
| ; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[34:37] offset:16 |
| ; GFX942-SDAG-NEXT: flat_store_dword v[10:11], a2 |
| ; GFX942-SDAG-NEXT: flat_store_dwordx2 v[2:3], a[0:1] offset:128 |
| ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-SDAG-LABEL: flat_offset_inbounds_very_wide: |
| ; GFX11-SDAG: ; %bb.0: |
| ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] |
| ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 |
| ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo |
| ; GFX11-SDAG-NEXT: v_add_co_u32 v36, vcc_lo, v0, 28 |
| ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v37, null, 0, v1, vcc_lo |
| ; GFX11-SDAG-NEXT: s_clause 0x7 |
| ; GFX11-SDAG-NEXT: flat_load_b128 v[4:7], v[36:37] offset:80 |
| ; GFX11-SDAG-NEXT: flat_load_b128 v[8:11], v[36:37] offset:96 |
| ; GFX11-SDAG-NEXT: flat_load_b128 v[12:15], v[36:37] offset:64 |
| ; GFX11-SDAG-NEXT: flat_load_b128 v[16:19], v[36:37] offset:32 |
| ; GFX11-SDAG-NEXT: flat_load_b128 v[20:23], v[36:37] offset:16 |
| ; GFX11-SDAG-NEXT: flat_load_b128 v[24:27], v[36:37] |
| ; GFX11-SDAG-NEXT: flat_load_b128 v[28:31], v[0:1] offset:12 |
| ; GFX11-SDAG-NEXT: flat_load_b128 v[32:35], v[36:37] offset:112 |
| ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-SDAG-NEXT: flat_load_b128 v[35:38], v[36:37] offset:48 |
| ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v2, 48 |
| ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo |
| ; GFX11-SDAG-NEXT: v_add_co_u32 v48, vcc_lo, 0x88, v2 |
| ; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v49, null, 0, v3, vcc_lo |
| ; GFX11-SDAG-NEXT: s_clause 0x7 |
| ; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[4:7] offset:48 |
| ; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[8:11] offset:64 |
| ; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[12:15] offset:32 |
| ; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[16:19] |
| ; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[20:23] offset:32 |
| ; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[24:27] offset:16 |
| ; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[28:31] |
| ; GFX11-SDAG-NEXT: flat_store_b32 v[48:49], v34 |
| ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) |
| ; GFX11-SDAG-NEXT: s_clause 0x1 |
| ; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[35:38] offset:64 |
| ; GFX11-SDAG-NEXT: flat_store_b64 v[2:3], v[32:33] offset:128 |
| ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX12-SDAG-LABEL: flat_offset_inbounds_very_wide: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] |
| ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 |
| ; GFX12-SDAG-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo |
| ; GFX12-SDAG-NEXT: v_add_co_u32 v36, vcc_lo, v0, 28 |
| ; GFX12-SDAG-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v37, null, 0, v1, vcc_lo |
| ; GFX12-SDAG-NEXT: s_clause 0x7 |
| ; GFX12-SDAG-NEXT: flat_load_b128 v[4:7], v[36:37] offset:80 |
| ; GFX12-SDAG-NEXT: flat_load_b128 v[8:11], v[36:37] offset:96 |
| ; GFX12-SDAG-NEXT: flat_load_b128 v[12:15], v[36:37] offset:64 |
| ; GFX12-SDAG-NEXT: flat_load_b128 v[16:19], v[36:37] offset:32 |
| ; GFX12-SDAG-NEXT: flat_load_b128 v[20:23], v[36:37] offset:16 |
| ; GFX12-SDAG-NEXT: flat_load_b128 v[24:27], v[36:37] |
| ; GFX12-SDAG-NEXT: flat_load_b128 v[28:31], v[0:1] offset:12 |
| ; GFX12-SDAG-NEXT: flat_load_b128 v[32:35], v[36:37] offset:112 |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: flat_load_b128 v[35:38], v[36:37] offset:48 |
| ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v2, 48 |
| ; GFX12-SDAG-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo |
| ; GFX12-SDAG-NEXT: v_add_co_u32 v48, vcc_lo, 0x88, v2 |
| ; GFX12-SDAG-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v49, null, 0, v3, vcc_lo |
| ; GFX12-SDAG-NEXT: s_clause 0x7 |
| ; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[4:7] offset:48 |
| ; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[8:11] offset:64 |
| ; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[12:15] offset:32 |
| ; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[16:19] |
| ; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[20:23] offset:32 |
| ; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[24:27] offset:16 |
| ; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[28:31] |
| ; GFX12-SDAG-NEXT: flat_store_b32 v[48:49], v34 |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x8 |
| ; GFX12-SDAG-NEXT: s_clause 0x1 |
| ; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[35:38] offset:64 |
| ; GFX12-SDAG-NEXT: flat_store_b64 v[2:3], v[32:33] offset:128 |
| ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX90A-GISEL-LABEL: flat_offset_inbounds_very_wide: |
| ; GFX90A-GISEL: ; %bb.0: |
| ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX90A-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX90A-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] |
| ; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 |
| ; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc |
| ; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 |
| ; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:28 |
| ; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:44 |
| ; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:60 |
| ; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:76 |
| ; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:92 |
| ; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:108 |
| ; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:124 |
| ; GFX90A-GISEL-NEXT: flat_load_dwordx3 v[36:38], v[0:1] offset:140 |
| ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] |
| ; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16 |
| ; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32 |
| ; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:48 |
| ; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64 |
| ; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:80 |
| ; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[28:31] offset:96 |
| ; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:112 |
| ; GFX90A-GISEL-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128 |
| ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-GISEL-LABEL: flat_offset_inbounds_very_wide: |
| ; GFX10-GISEL: ; %bb.0: |
| ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX10-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] |
| ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 |
| ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo |
| ; GFX10-GISEL-NEXT: s_clause 0x8 |
| ; GFX10-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 |
| ; GFX10-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:28 |
| ; GFX10-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:44 |
| ; GFX10-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:60 |
| ; GFX10-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:76 |
| ; GFX10-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:92 |
| ; GFX10-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:108 |
| ; GFX10-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:124 |
| ; GFX10-GISEL-NEXT: flat_load_dwordx3 v[36:38], v[0:1] offset:140 |
| ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) |
| ; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] |
| ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8) |
| ; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16 |
| ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8) |
| ; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32 |
| ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8) |
| ; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:48 |
| ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8) |
| ; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64 |
| ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8) |
| ; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:80 |
| ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8) |
| ; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[28:31] offset:96 |
| ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8) |
| ; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:112 |
| ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) |
| ; GFX10-GISEL-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128 |
| ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX942-GISEL-LABEL: flat_offset_inbounds_very_wide: |
| ; GFX942-GISEL: ; %bb.0: |
| ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX942-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] |
| ; GFX942-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 |
| ; GFX942-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:28 |
| ; GFX942-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:44 |
| ; GFX942-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:60 |
| ; GFX942-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:76 |
| ; GFX942-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:92 |
| ; GFX942-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:108 |
| ; GFX942-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:124 |
| ; GFX942-GISEL-NEXT: flat_load_dwordx3 v[36:38], v[0:1] offset:140 |
| ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] |
| ; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16 |
| ; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32 |
| ; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:48 |
| ; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64 |
| ; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:80 |
| ; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[28:31] offset:96 |
| ; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:112 |
| ; GFX942-GISEL-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128 |
| ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-GISEL-LABEL: flat_offset_inbounds_very_wide: |
| ; GFX11-GISEL: ; %bb.0: |
| ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] |
| ; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 |
| ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo |
| ; GFX11-GISEL-NEXT: s_clause 0x8 |
| ; GFX11-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 |
| ; GFX11-GISEL-NEXT: flat_load_b128 v[8:11], v[0:1] offset:28 |
| ; GFX11-GISEL-NEXT: flat_load_b128 v[12:15], v[0:1] offset:44 |
| ; GFX11-GISEL-NEXT: flat_load_b128 v[16:19], v[0:1] offset:60 |
| ; GFX11-GISEL-NEXT: flat_load_b128 v[20:23], v[0:1] offset:76 |
| ; GFX11-GISEL-NEXT: flat_load_b128 v[24:27], v[0:1] offset:92 |
| ; GFX11-GISEL-NEXT: flat_load_b128 v[28:31], v[0:1] offset:108 |
| ; GFX11-GISEL-NEXT: flat_load_b128 v[32:35], v[0:1] offset:124 |
| ; GFX11-GISEL-NEXT: flat_load_b96 v[36:38], v[0:1] offset:140 |
| ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) |
| ; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7] |
| ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8) |
| ; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[8:11] offset:16 |
| ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8) |
| ; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[12:15] offset:32 |
| ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8) |
| ; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[16:19] offset:48 |
| ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8) |
| ; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[20:23] offset:64 |
| ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8) |
| ; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[24:27] offset:80 |
| ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8) |
| ; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[28:31] offset:96 |
| ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8) |
| ; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[32:35] offset:112 |
| ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) |
| ; GFX11-GISEL-NEXT: flat_store_b96 v[2:3], v[36:38] offset:128 |
| ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX12-GISEL-LABEL: flat_offset_inbounds_very_wide: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] |
| ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 |
| ; GFX12-GISEL-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo |
| ; GFX12-GISEL-NEXT: s_clause 0x8 |
| ; GFX12-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 |
| ; GFX12-GISEL-NEXT: flat_load_b128 v[8:11], v[0:1] offset:28 |
| ; GFX12-GISEL-NEXT: flat_load_b128 v[12:15], v[0:1] offset:44 |
| ; GFX12-GISEL-NEXT: flat_load_b128 v[16:19], v[0:1] offset:60 |
| ; GFX12-GISEL-NEXT: flat_load_b128 v[20:23], v[0:1] offset:76 |
| ; GFX12-GISEL-NEXT: flat_load_b128 v[24:27], v[0:1] offset:92 |
| ; GFX12-GISEL-NEXT: flat_load_b128 v[28:31], v[0:1] offset:108 |
| ; GFX12-GISEL-NEXT: flat_load_b128 v[32:35], v[0:1] offset:124 |
| ; GFX12-GISEL-NEXT: flat_load_b96 v[36:38], v[0:1] offset:140 |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x808 |
| ; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7] |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x708 |
| ; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[8:11] offset:16 |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x608 |
| ; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[12:15] offset:32 |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x508 |
| ; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[16:19] offset:48 |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x408 |
| ; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[20:23] offset:64 |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x308 |
| ; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[24:27] offset:80 |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x208 |
| ; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[28:31] offset:96 |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x108 |
| ; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[32:35] offset:112 |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x8 |
| ; GFX12-GISEL-NEXT: flat_store_b96 v[2:3], v[36:38] offset:128 |
| ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] |
| %p.1 = getelementptr inbounds i32, ptr %p, i32 %i |
| %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3 |
| %l = load <35 x i32>, ptr %arrayidx |
| store <35 x i32> %l, ptr %pout |
| ret void |
| } |
| ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: |
| ; GFX10-GISEL-FLATSCR: {{.*}} |
| ; GFX10-MUBUF: {{.*}} |
| ; GFX10-SDAG-FLATSCR: {{.*}} |
| ; GFX90A-GISEL-FLATSCR: {{.*}} |
| ; GFX90A-MUBUF: {{.*}} |