blob: af753812b4c0c1c2824100182db16a97a1419b3c [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-SDAG,GFX90A-MUBUF,GFX90A-SDAG-MUBUF %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-SDAG,GFX90A-FLATSCR,GFX90A-SDAG-FLATSCR %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-SDAG,GFX10-MUBUF,GFX10-SDAG-MUBUF %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-SDAG,GFX10-FLATSCR,GFX10-SDAG-FLATSCR %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-GISEL,GFX90A-MUBUF,GFX90A-GISEL-MUBUF %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-GISEL,GFX90A-FLATSCR,GFX90A-GISEL-FLATSCR %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-GISEL,GFX10-MUBUF,GFX10-GISEL-MUBUF %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-GISEL,GFX10-FLATSCR,GFX10-GISEL-FLATSCR %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-GISEL %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-GISEL %s
; This test checks memory addresses with constant offset components that should
; not be folded into memory accesses with immediate offsets.
; SeparateConstOffsetsFromGEP transforms the GEPs in a way that can lead to
; out-of-bounds or negative intermediate results in the address computation,
; which are problematic for flat and scratch instructions:
; gep[inbounds](p, i + 3) -> gep(gep(p, i), 3)
; The offset here cannot be folded: if %p points to the beginning of scratch or
; LDS and %i is -1, a folded offset crashes the program.
define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) {
; GFX90A-LABEL: flat_offset_maybe_oob:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 12, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_offset_maybe_oob:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-SDAG-LABEL: flat_offset_maybe_oob:
; GFX942-SDAG: ; %bb.0:
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 12
; GFX942-SDAG-NEXT: flat_load_dword v0, v[0:1]
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_offset_maybe_oob:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_offset_maybe_oob:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-GISEL-LABEL: flat_offset_maybe_oob:
; GFX942-GISEL: ; %bb.0:
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 12, v0
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX942-GISEL-NEXT: flat_load_dword v0, v[0:1]
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
; The constant 3 is added to the index (add nsw) before the single inbounds
; GEP, so the IR contains no inbounds GEP for the intermediate address
; %p + 4 * %i. In every check group above, the resulting 12-byte offset is
; added with separate instructions and the flat load carries no offset field.
%idx = add nsw i32 %i, 3
%arrayidx = getelementptr inbounds i32, ptr %p, i32 %idx
%l = load i32, ptr %arrayidx
ret i32 %l
}
; For MUBUF and for GFX12, folding the offset is okay.
define i32 @private_offset_maybe_oob(ptr addrspace(5) %p, i32 %i) {
; GFX90A-SDAG-MUBUF-LABEL: private_offset_maybe_oob:
; GFX90A-SDAG-MUBUF: ; %bb.0:
; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-SDAG-MUBUF-NEXT: v_lshl_add_u32 v0, v1, 2, v0
; GFX90A-SDAG-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12
; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX90A-SDAG-MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-FLATSCR-LABEL: private_offset_maybe_oob:
; GFX90A-FLATSCR: ; %bb.0:
; GFX90A-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX90A-FLATSCR-NEXT: v_add3_u32 v0, v0, v1, 12
; GFX90A-FLATSCR-NEXT: scratch_load_dword v0, v0, off
; GFX90A-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX90A-FLATSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-MUBUF-LABEL: private_offset_maybe_oob:
; GFX10-SDAG-MUBUF: ; %bb.0:
; GFX10-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-MUBUF-NEXT: v_lshl_add_u32 v0, v1, 2, v0
; GFX10-SDAG-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12
; GFX10-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-FLATSCR-LABEL: private_offset_maybe_oob:
; GFX10-FLATSCR: ; %bb.0:
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-FLATSCR-NEXT: v_add3_u32 v0, v0, v1, 12
; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v0, off
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: private_offset_maybe_oob:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX942-NEXT: v_add3_u32 v0, v0, v1, 12
; GFX942-NEXT: scratch_load_dword v0, v0, off
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_offset_maybe_oob:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, v0, v1, 12
; GFX11-NEXT: scratch_load_b32 v0, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: private_offset_maybe_oob:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, v0
; GFX12-SDAG-NEXT: scratch_load_b32 v0, v0, off offset:12
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-GISEL-MUBUF-LABEL: private_offset_maybe_oob:
; GFX90A-GISEL-MUBUF: ; %bb.0:
; GFX90A-GISEL-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-GISEL-MUBUF-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX90A-GISEL-MUBUF-NEXT: v_add_u32_e32 v0, v0, v1
; GFX90A-GISEL-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12
; GFX90A-GISEL-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX90A-GISEL-MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-MUBUF-LABEL: private_offset_maybe_oob:
; GFX10-GISEL-MUBUF: ; %bb.0:
; GFX10-GISEL-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-MUBUF-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-GISEL-MUBUF-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX10-GISEL-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12
; GFX10-GISEL-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: private_offset_maybe_oob:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: scratch_load_b32 v0, v0, off offset:12
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
; Index-plus-constant pattern (add nsw, then one inbounds GEP) on an
; addrspace(5) pointer. In the checks above, the MUBUF and GFX12 groups place
; the 12 bytes in the load's offset field, while the FLATSCR, GFX942 and GFX11
; groups add 12 to the address with v_add3_u32 and load without an offset.
%idx = add nsw i32 %i, 3
%arrayidx = getelementptr inbounds i32, ptr addrspace(5) %p, i32 %idx
%l = load i32, ptr addrspace(5) %arrayidx
ret i32 %l
}
; If the GEP that adds the offset is inbounds, folding the offset is legal.
define i32 @flat_offset_inbounds(ptr %p, i32 %i) {
; GFX90A-LABEL: flat_offset_inbounds:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX90A-NEXT: flat_load_dword v0, v[0:1] offset:12
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_offset_inbounds:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX10-NEXT: flat_load_dword v0, v[0:1] offset:12
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_offset_inbounds:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
; GFX942-NEXT: flat_load_dword v0, v[0:1] offset:12
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_offset_inbounds:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:12
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_offset_inbounds:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-NEXT: flat_load_b32 v0, v[0:1] offset:12
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
; Two separate inbounds GEPs: %p.1 = %p + 4 * %i, then the constant 3-element
; (12-byte) step on top of %p.1. Every check group above places the 12 bytes in
; the flat load's offset field.
%p.1 = getelementptr inbounds i32, ptr %p, i32 %i
%arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3
%l = load i32, ptr %arrayidx
ret i32 %l
}
; Wide variant of the inbounds-offset case: a <5 x i32> value is loaded from
; %p + 4 * %i + 12 and stored to %pout. In the checks below the access is split
; into a 16-byte part and a 4-byte part. The 16-byte load uses offset:12 in all
; groups; for the trailing 4-byte load at byte 28, the SDAG groups compute the
; address with explicit adds while the GISEL groups use offset:28.
define void @flat_offset_inbounds_wide(ptr %p, ptr %pout, i32 %i) {
; GFX90A-SDAG-LABEL: flat_offset_inbounds_wide:
; GFX90A-SDAG: ; %bb.0:
; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX90A-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
; GFX90A-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
; GFX90A-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
; GFX90A-SDAG-NEXT: v_add_co_u32_e32 v4, vcc, 28, v0
; GFX90A-SDAG-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-SDAG-NEXT: flat_load_dword v10, v[4:5]
; GFX90A-SDAG-NEXT: flat_load_dwordx4 v[6:9], v[0:1] offset:12
; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-SDAG-NEXT: flat_store_dword v[2:3], v10 offset:16
; GFX90A-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[6:9]
; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: flat_offset_inbounds_wide:
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX10-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX10-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 28
; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: flat_load_dword v8, v[4:5]
; GFX10-SDAG-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; GFX10-SDAG-NEXT: flat_store_dword v[2:3], v8 offset:16
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-SDAG-LABEL: flat_offset_inbounds_wide:
; GFX942-SDAG: ; %bb.0:
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1]
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 28
; GFX942-SDAG-NEXT: flat_load_dword v10, v[4:5]
; GFX942-SDAG-NEXT: flat_load_dwordx4 v[6:9], v[0:1] offset:12
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-SDAG-NEXT: flat_store_dword v[2:3], v10 offset:16
; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[6:9]
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: flat_offset_inbounds_wide:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 28
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: flat_load_b32 v8, v[4:5]
; GFX11-SDAG-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; GFX11-SDAG-NEXT: flat_store_b32 v[2:3], v8 offset:16
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[4:7]
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: flat_offset_inbounds_wide:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5]
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX12-SDAG-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 28
; GFX12-SDAG-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: flat_load_b32 v8, v[4:5]
; GFX12-SDAG-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x101
; GFX12-SDAG-NEXT: flat_store_b32 v[2:3], v8 offset:16
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x1
; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[4:7]
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-GISEL-LABEL: flat_offset_inbounds_wide:
; GFX90A-GISEL: ; %bb.0:
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX90A-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12
; GFX90A-GISEL-NEXT: flat_load_dword v8, v[0:1] offset:28
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
; GFX90A-GISEL-NEXT: flat_store_dword v[2:3], v8 offset:16
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: flat_offset_inbounds_wide:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX10-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12
; GFX10-GISEL-NEXT: flat_load_dword v0, v[0:1] offset:28
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
; GFX10-GISEL-NEXT: flat_store_dword v[2:3], v0 offset:16
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-GISEL-LABEL: flat_offset_inbounds_wide:
; GFX942-GISEL: ; %bb.0:
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1]
; GFX942-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12
; GFX942-GISEL-NEXT: flat_load_dword v8, v[0:1] offset:28
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
; GFX942-GISEL-NEXT: flat_store_dword v[2:3], v8 offset:16
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: flat_offset_inbounds_wide:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12
; GFX11-GISEL-NEXT: flat_load_b32 v0, v[0:1] offset:28
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
; GFX11-GISEL-NEXT: flat_store_b32 v[2:3], v0 offset:16
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: flat_offset_inbounds_wide:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5]
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX12-GISEL-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12
; GFX12-GISEL-NEXT: flat_load_b32 v0, v[0:1] offset:28
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x101
; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x1
; GFX12-GISEL-NEXT: flat_store_b32 v[2:3], v0 offset:16
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
; The variable index and the constant 3-element (12-byte) step are separate
; inbounds GEPs; the 20-byte vector is then loaded and stored as a whole.
%p.1 = getelementptr inbounds i32, ptr %p, i32 %i
%arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3
%l = load <5 x i32>, ptr %arrayidx
store <5 x i32> %l, ptr %pout
ret void
}
define void @flat_offset_inbounds_very_wide(ptr %p, ptr %pout, i32 %i) {
; GFX90A-SDAG-MUBUF-LABEL: flat_offset_inbounds_very_wide:
; GFX90A-SDAG-MUBUF: ; %bb.0:
; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-SDAG-MUBUF-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX90A-SDAG-MUBUF-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e64 v6, s[4:5], 28, v0
; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e64 v7, s[4:5], 0, v1, s[4:5]
; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e64 v8, s[4:5], 44, v0
; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, 0x8c, v0
; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v1, s[4:5]
; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[10:13], v[8:9] offset:16
; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[14:17], v[8:9] offset:32
; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[18:21], v[8:9] offset:48
; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[22:25], v[8:9] offset:64
; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[26:29], v[8:9] offset:80
; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[30:33], v[6:7]
; GFX90A-SDAG-MUBUF-NEXT: s_nop 0
; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[6:9], v[8:9]
; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[34:37], v[0:1] offset:12
; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[48:51], v[4:5]
; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, 48, v2
; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, 0x88, v2
; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc
; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:48
; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:64
; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[14:17] offset:64
; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:32
; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[6:9] offset:32
; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[30:33] offset:16
; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[34:37]
; GFX90A-SDAG-MUBUF-NEXT: flat_store_dword v[4:5], v50
; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx2 v[2:3], v[48:49] offset:128
; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-SDAG-MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-SDAG-FLATSCR-LABEL: flat_offset_inbounds_very_wide:
; GFX90A-SDAG-FLATSCR: ; %bb.0:
; GFX90A-SDAG-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-SDAG-FLATSCR-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX90A-SDAG-FLATSCR-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e64 v6, s[0:1], 28, v0
; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e64 v8, s[0:1], 44, v0
; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, 0x8c, v0
; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1]
; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[10:13], v[8:9] offset:16
; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[14:17], v[8:9] offset:32
; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[18:21], v[8:9] offset:48
; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[22:25], v[8:9] offset:64
; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[26:29], v[8:9] offset:80
; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[30:33], v[6:7]
; GFX90A-SDAG-FLATSCR-NEXT: s_nop 0
; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[6:9], v[8:9]
; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[34:37], v[0:1] offset:12
; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[48:51], v[4:5]
; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 48, v2
; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, 0x88, v2
; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc
; GFX90A-SDAG-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:48
; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:64
; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[14:17] offset:64
; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:32
; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[6:9] offset:32
; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[30:33] offset:16
; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[34:37]
; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dword v[4:5], v50
; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx2 v[2:3], v[48:49] offset:128
; GFX90A-SDAG-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-SDAG-FLATSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: flat_offset_inbounds_very_wide:
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX10-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX10-SDAG-NEXT: v_add_co_u32 v36, vcc_lo, v0, 28
; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v37, null, 0, v1, vcc_lo
; GFX10-SDAG-NEXT: s_clause 0x8
; GFX10-SDAG-NEXT: flat_load_dwordx4 v[4:7], v[36:37] offset:80
; GFX10-SDAG-NEXT: flat_load_dwordx4 v[8:11], v[36:37] offset:96
; GFX10-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[36:37] offset:48
; GFX10-SDAG-NEXT: flat_load_dwordx4 v[16:19], v[36:37] offset:64
; GFX10-SDAG-NEXT: flat_load_dwordx4 v[20:23], v[36:37] offset:16
; GFX10-SDAG-NEXT: flat_load_dwordx4 v[24:27], v[36:37] offset:32
; GFX10-SDAG-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:12
; GFX10-SDAG-NEXT: flat_load_dwordx4 v[32:35], v[36:37]
; GFX10-SDAG-NEXT: flat_load_dwordx4 v[36:39], v[36:37] offset:112
; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v2, 48
; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
; GFX10-SDAG-NEXT: v_add_co_u32 v48, vcc_lo, 0x88, v2
; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v49, null, 0, v3, vcc_lo
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:48
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8)
; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:64
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8)
; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:64
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8)
; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:32
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8)
; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:32
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8)
; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8)
; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[28:31]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8)
; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:16
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8)
; GFX10-SDAG-NEXT: flat_store_dword v[48:49], v38
; GFX10-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[36:37] offset:128
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-SDAG-LABEL: flat_offset_inbounds_very_wide:
; GFX942-SDAG: ; %bb.0:
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1]
; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x5c
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[10:11], v[0:1], 0, s[2:3]
; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x4c
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[12:13], v[0:1], 0, s[2:3]
; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x7c
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[14:15], v[0:1], 0, s[2:3]
; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x6c
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[16:17], v[0:1], 0, s[2:3]
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 28
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, 60
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[8:9], v[0:1], 0, 44
; GFX942-SDAG-NEXT: flat_load_dwordx4 v[18:21], v[16:17]
; GFX942-SDAG-NEXT: flat_load_dwordx4 v[22:25], v[12:13]
; GFX942-SDAG-NEXT: flat_load_dwordx4 v[26:29], v[14:15]
; GFX942-SDAG-NEXT: ; kill: killed $vgpr12_vgpr13
; GFX942-SDAG-NEXT: ; kill: killed $vgpr14_vgpr15
; GFX942-SDAG-NEXT: ; kill: killed $vgpr16_vgpr17
; GFX942-SDAG-NEXT: s_nop 0
; GFX942-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[8:9]
; GFX942-SDAG-NEXT: flat_load_dwordx4 v[30:33], v[10:11]
; GFX942-SDAG-NEXT: flat_load_dwordx4 v[34:37], v[4:5]
; GFX942-SDAG-NEXT: flat_load_dwordx4 v[48:51], v[6:7]
; GFX942-SDAG-NEXT: flat_load_dwordx4 v[52:55], v[0:1] offset:12
; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0x8c
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
; GFX942-SDAG-NEXT: flat_load_dwordx4 a[0:3], v[0:1]
; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0x60
; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x70
; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0x50
; GFX942-SDAG-NEXT: s_mov_b64 s[6:7], 0x88
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, s[0:1]
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, s[2:3]
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[6:7], v[2:3], 0, s[4:5]
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[8:9], v[2:3], 0, 48
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[10:11], v[2:3], 0, s[6:7]
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[18:21]
; GFX942-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[26:29]
; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[22:25] offset:64
; GFX942-SDAG-NEXT: flat_store_dwordx4 v[6:7], v[30:33]
; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32
; GFX942-SDAG-NEXT: flat_store_dwordx4 v[8:9], v[48:51]
; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[52:55]
; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[34:37] offset:16
; GFX942-SDAG-NEXT: flat_store_dword v[10:11], a2
; GFX942-SDAG-NEXT: flat_store_dwordx2 v[2:3], a[0:1] offset:128
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: flat_offset_inbounds_very_wide:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v36, vcc_lo, v0, 28
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v37, null, 0, v1, vcc_lo
; GFX11-SDAG-NEXT: s_clause 0x7
; GFX11-SDAG-NEXT: flat_load_b128 v[4:7], v[36:37] offset:80
; GFX11-SDAG-NEXT: flat_load_b128 v[8:11], v[36:37] offset:96
; GFX11-SDAG-NEXT: flat_load_b128 v[12:15], v[36:37] offset:64
; GFX11-SDAG-NEXT: flat_load_b128 v[16:19], v[36:37] offset:32
; GFX11-SDAG-NEXT: flat_load_b128 v[20:23], v[36:37] offset:16
; GFX11-SDAG-NEXT: flat_load_b128 v[24:27], v[36:37]
; GFX11-SDAG-NEXT: flat_load_b128 v[28:31], v[0:1] offset:12
; GFX11-SDAG-NEXT: flat_load_b128 v[32:35], v[36:37] offset:112
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_load_b128 v[35:38], v[36:37] offset:48
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v2, 48
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v48, vcc_lo, 0x88, v2
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v49, null, 0, v3, vcc_lo
; GFX11-SDAG-NEXT: s_clause 0x7
; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[4:7] offset:48
; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[8:11] offset:64
; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[12:15] offset:32
; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[16:19]
; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[20:23] offset:32
; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[24:27] offset:16
; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[28:31]
; GFX11-SDAG-NEXT: flat_store_b32 v[48:49], v34
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8)
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[35:38] offset:64
; GFX11-SDAG-NEXT: flat_store_b64 v[2:3], v[32:33] offset:128
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: flat_offset_inbounds_very_wide:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5]
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX12-SDAG-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v36, vcc_lo, v0, 28
; GFX12-SDAG-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v37, null, 0, v1, vcc_lo
; GFX12-SDAG-NEXT: s_clause 0x7
; GFX12-SDAG-NEXT: flat_load_b128 v[4:7], v[36:37] offset:80
; GFX12-SDAG-NEXT: flat_load_b128 v[8:11], v[36:37] offset:96
; GFX12-SDAG-NEXT: flat_load_b128 v[12:15], v[36:37] offset:64
; GFX12-SDAG-NEXT: flat_load_b128 v[16:19], v[36:37] offset:32
; GFX12-SDAG-NEXT: flat_load_b128 v[20:23], v[36:37] offset:16
; GFX12-SDAG-NEXT: flat_load_b128 v[24:27], v[36:37]
; GFX12-SDAG-NEXT: flat_load_b128 v[28:31], v[0:1] offset:12
; GFX12-SDAG-NEXT: flat_load_b128 v[32:35], v[36:37] offset:112
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_load_b128 v[35:38], v[36:37] offset:48
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v2, 48
; GFX12-SDAG-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v48, vcc_lo, 0x88, v2
; GFX12-SDAG-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v49, null, 0, v3, vcc_lo
; GFX12-SDAG-NEXT: s_clause 0x7
; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[4:7] offset:48
; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[8:11] offset:64
; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[12:15] offset:32
; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[16:19]
; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[20:23] offset:32
; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[24:27] offset:16
; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[28:31]
; GFX12-SDAG-NEXT: flat_store_b32 v[48:49], v34
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x8
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[35:38] offset:64
; GFX12-SDAG-NEXT: flat_store_b64 v[2:3], v[32:33] offset:128
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-GISEL-LABEL: flat_offset_inbounds_very_wide:
; GFX90A-GISEL: ; %bb.0:
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX90A-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12
; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:28
; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:44
; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:60
; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:76
; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:92
; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:108
; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:124
; GFX90A-GISEL-NEXT: flat_load_dwordx3 v[36:38], v[0:1] offset:140
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16
; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32
; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:48
; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64
; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:80
; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[28:31] offset:96
; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:112
; GFX90A-GISEL-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: flat_offset_inbounds_very_wide:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX10-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX10-GISEL-NEXT: s_clause 0x8
; GFX10-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12
; GFX10-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:28
; GFX10-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:44
; GFX10-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:60
; GFX10-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:76
; GFX10-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:92
; GFX10-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:108
; GFX10-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:124
; GFX10-GISEL-NEXT: flat_load_dwordx3 v[36:38], v[0:1] offset:140
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8)
; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8)
; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8)
; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:48
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8)
; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8)
; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:80
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8)
; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[28:31] offset:96
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8)
; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:112
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8)
; GFX10-GISEL-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-GISEL-LABEL: flat_offset_inbounds_very_wide:
; GFX942-GISEL: ; %bb.0:
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1]
; GFX942-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12
; GFX942-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:28
; GFX942-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:44
; GFX942-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:60
; GFX942-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:76
; GFX942-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:92
; GFX942-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:108
; GFX942-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:124
; GFX942-GISEL-NEXT: flat_load_dwordx3 v[36:38], v[0:1] offset:140
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16
; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32
; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:48
; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64
; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:80
; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[28:31] offset:96
; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:112
; GFX942-GISEL-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: flat_offset_inbounds_very_wide:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX11-GISEL-NEXT: s_clause 0x8
; GFX11-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12
; GFX11-GISEL-NEXT: flat_load_b128 v[8:11], v[0:1] offset:28
; GFX11-GISEL-NEXT: flat_load_b128 v[12:15], v[0:1] offset:44
; GFX11-GISEL-NEXT: flat_load_b128 v[16:19], v[0:1] offset:60
; GFX11-GISEL-NEXT: flat_load_b128 v[20:23], v[0:1] offset:76
; GFX11-GISEL-NEXT: flat_load_b128 v[24:27], v[0:1] offset:92
; GFX11-GISEL-NEXT: flat_load_b128 v[28:31], v[0:1] offset:108
; GFX11-GISEL-NEXT: flat_load_b128 v[32:35], v[0:1] offset:124
; GFX11-GISEL-NEXT: flat_load_b96 v[36:38], v[0:1] offset:140
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8)
; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[8:11] offset:16
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8)
; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[12:15] offset:32
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8)
; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[16:19] offset:48
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8)
; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[20:23] offset:64
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8)
; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[24:27] offset:80
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8)
; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[28:31] offset:96
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8)
; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[32:35] offset:112
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8)
; GFX11-GISEL-NEXT: flat_store_b96 v[2:3], v[36:38] offset:128
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: flat_offset_inbounds_very_wide:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5]
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX12-GISEL-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX12-GISEL-NEXT: s_clause 0x8
; GFX12-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12
; GFX12-GISEL-NEXT: flat_load_b128 v[8:11], v[0:1] offset:28
; GFX12-GISEL-NEXT: flat_load_b128 v[12:15], v[0:1] offset:44
; GFX12-GISEL-NEXT: flat_load_b128 v[16:19], v[0:1] offset:60
; GFX12-GISEL-NEXT: flat_load_b128 v[20:23], v[0:1] offset:76
; GFX12-GISEL-NEXT: flat_load_b128 v[24:27], v[0:1] offset:92
; GFX12-GISEL-NEXT: flat_load_b128 v[28:31], v[0:1] offset:108
; GFX12-GISEL-NEXT: flat_load_b128 v[32:35], v[0:1] offset:124
; GFX12-GISEL-NEXT: flat_load_b96 v[36:38], v[0:1] offset:140
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x808
; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x708
; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[8:11] offset:16
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x608
; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[12:15] offset:32
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x508
; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[16:19] offset:48
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x408
; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[20:23] offset:64
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x308
; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[24:27] offset:80
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x208
; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[28:31] offset:96
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x108
; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[32:35] offset:112
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x8
; GFX12-GISEL-NEXT: flat_store_b96 v[2:3], v[36:38] offset:128
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%p.1 = getelementptr inbounds i32, ptr %p, i32 %i
%arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3
%l = load <35 x i32>, ptr %arrayidx
store <35 x i32> %l, ptr %pout
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX10-GISEL-FLATSCR: {{.*}}
; GFX10-MUBUF: {{.*}}
; GFX10-SDAG-FLATSCR: {{.*}}
; GFX90A-GISEL-FLATSCR: {{.*}}
; GFX90A-MUBUF: {{.*}}