| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX900 %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX90A %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s |
| |
| declare i64 @_Z13get_global_idj(i32) #0 |
| |
; clmem_read_simplified: straight-line kernel that issues eight i64 loads at
; constant strides from one computed address, sums them, and stores the sum.
;
;   id   = result of the external call @_Z13get_global_idj(i32 0)
;          (declared above, not defined in this file; presumably OpenCL
;          get_global_id, judging by the mangled name)
;   base = %buffer + ((id << 7) & 0xffff8000)        ; byte offset
;   p    = base + (id & 255) * 8                     ; i64 element index
;   sum  = p[0] + p[256] + p[512] + ... + p[1792]    ; i64 elements, 2048 bytes apart
;   *base = sum
;
; The autogenerated assertions below record, per -mcpu run line, how the
; constant byte offsets 0x0 .. 0x3800 are split between address arithmetic and
; the load instructions: the gfx803 checks use flat loads whose addresses are
; all formed by explicit adds, while the gfx900/gfx90a, gfx1010 and gfx1100
; checks use global loads that carry part of the offset as an immediate.
; Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py.
| define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { |
| ; GFX8-LABEL: clmem_read_simplified: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX8-NEXT: s_mov_b32 s38, -1 |
| ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 |
| ; GFX8-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX8-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX8-NEXT: s_getpc_b64 s[0:1] |
| ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX8-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX8-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX8-NEXT: s_mov_b32 s32, 0 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 |
| ; GFX8-NEXT: v_mov_b32_e32 v2, s35 |
| ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 |
| ; GFX8-NEXT: v_mov_b32_e32 v3, 3 |
| ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc |
| ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 |
| ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0x800 |
| ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0x1000 |
| ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0x1800 |
| ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc |
| ; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[3:4] |
| ; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] |
| ; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] |
| ; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] |
| ; GFX8-NEXT: s_movk_i32 s0, 0x2000 |
| ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v4, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0x2800 |
| ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc |
| ; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] |
| ; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] |
| ; GFX8-NEXT: s_movk_i32 s0, 0x3000 |
| ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v4, vcc |
| ; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18] |
| ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x3800, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc |
| ; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] |
| ; GFX8-NEXT: s_waitcnt vmcnt(6) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v11 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v6, v12, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(5) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v8, v5, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(4) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(3) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v13, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v14, v5, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(2) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v15, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v16, v5, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(1) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v17, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v18, v5, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc |
| ; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4] |
| ; GFX8-NEXT: s_endpgm |
| ; |
| ; GFX9-LABEL: clmem_read_simplified: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX9-NEXT: s_mov_b32 s38, -1 |
| ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 |
| ; GFX9-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX9-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX9-NEXT: s_getpc_b64 s[0:1] |
| ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX9-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: s_mov_b32 s32, 0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff8000, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s35 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v18 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, 3 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: s_movk_i32 s1, 0x2000 |
| ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off |
| ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2048 |
| ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, s1, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc |
| ; GFX9-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 |
| ; GFX9-NEXT: s_movk_i32 s0, 0x1000 |
| ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, s0, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc |
| ; GFX9-NEXT: global_load_dwordx2 v[12:13], v[10:11], off offset:2048 |
| ; GFX9-NEXT: global_load_dwordx2 v[14:15], v[6:7], off |
| ; GFX9-NEXT: global_load_dwordx2 v[16:17], v[6:7], off offset:2048 |
| ; GFX9-NEXT: s_movk_i32 s0, 0x3000 |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off |
| ; GFX9-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 |
| ; GFX9-NEXT: s_waitcnt vmcnt(6) |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v2 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v3, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(5) |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(4) |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v12, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v13, v1, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(3) |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(2) |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(1) |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc |
| ; GFX9-NEXT: global_store_dwordx2 v18, v[0:1], s[34:35] |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: clmem_read_simplified: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX10-NEXT: s_mov_b32 s38, -1 |
| ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 |
| ; GFX10-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX10-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX10-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX10-NEXT: s_mov_b32 s32, 0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX10-NEXT: v_and_b32_e32 v20, 0xffff8000, v1 |
| ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 |
| ; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v20 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x1000 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, 0x2000 |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off |
| ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off offset:-2048 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: global_load_dwordx2 v[10:11], v[2:3], off |
| ; GFX10-NEXT: global_load_dwordx2 v[12:13], v[8:9], off offset:-2048 |
| ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x3000 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: global_load_dwordx2 v[14:15], v[8:9], off |
| ; GFX10-NEXT: global_load_dwordx2 v[16:17], v[2:3], off offset:-2048 |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x3800, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off |
| ; GFX10-NEXT: global_load_dwordx2 v[18:19], v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(6) |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v4 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo |
| ; GFX10-NEXT: s_waitcnt vmcnt(5) |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v10, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo |
| ; GFX10-NEXT: s_waitcnt vmcnt(4) |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v12, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo |
| ; GFX10-NEXT: s_waitcnt vmcnt(3) |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v14, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v15, v1, vcc_lo |
| ; GFX10-NEXT: s_waitcnt vmcnt(2) |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v16, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v17, v1, vcc_lo |
| ; GFX10-NEXT: s_waitcnt vmcnt(1) |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v8, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v1, vcc_lo |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v18, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo |
| ; GFX10-NEXT: global_store_dwordx2 v20, v[0:1], s[34:35] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: clmem_read_simplified: |
| ; GFX11: ; %bb.0: ; %entry |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 |
| ; GFX11-NEXT: s_mov_b32 s32, 0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_and_b32_e32 v16, 0xffff8000, v1 |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v16 |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off |
| ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2048 |
| ; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v0, 0x2000 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo |
| ; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, 0x1000, v0 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: global_load_b64 v[10:11], v[6:7], off offset:-4096 |
| ; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off offset:2048 |
| ; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, 0x2000, v0 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo |
| ; GFX11-NEXT: global_load_b64 v[6:7], v[6:7], off |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x3000, v0 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo |
| ; GFX11-NEXT: s_clause 0x2 |
| ; GFX11-NEXT: global_load_b64 v[12:13], v[12:13], off offset:2048 |
| ; GFX11-NEXT: global_load_b64 v[14:15], v[0:1], off |
| ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048 |
| ; GFX11-NEXT: s_waitcnt vmcnt(6) |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo |
| ; GFX11-NEXT: s_waitcnt vmcnt(5) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v10, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v11, v3, vcc_lo |
| ; GFX11-NEXT: s_waitcnt vmcnt(4) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v3, vcc_lo |
| ; GFX11-NEXT: s_waitcnt vmcnt(3) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v6, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v7, v3, vcc_lo |
| ; GFX11-NEXT: s_waitcnt vmcnt(2) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v12, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v13, v3, vcc_lo |
| ; GFX11-NEXT: s_waitcnt vmcnt(1) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v14, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo |
| ; GFX11-NEXT: global_store_b64 v16, v[0:1], s[34:35] |
| ; GFX11-NEXT: s_endpgm |
| entry: |
| %call = tail call i64 @_Z13get_global_idj(i32 0) |
; %conv keeps the low 8 bits of the id; it is used below as an i64 element index.
| %conv = and i64 %call, 255 |
; (id << 7) masked with 4294934528 (0xffff8000) is the byte offset of the base
; pointer %add.ptr12 inside %buffer.
| %a0 = shl i64 %call, 7 |
| %idx.ext11 = and i64 %a0, 4294934528 |
| %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11 |
| |
; First load address: base + %conv i64 elements (%conv * 8 bytes).
| %addr1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv |
| %load1 = load i64, ptr addrspace(1) %addr1, align 8 |
; Every later load is indexed from %addr1 and steps another 256 i64 elements
; (2048 bytes): indices 256, 512, ..., 1792.
| %addr2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 256 |
| %load2 = load i64, ptr addrspace(1) %addr2, align 8 |
| %add.1 = add i64 %load2, %load1 |
| |
| %add.ptr8.2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 512 |
| %load3 = load i64, ptr addrspace(1) %add.ptr8.2, align 8 |
| %add.2 = add i64 %load3, %add.1 |
| %add.ptr8.3 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 768 |
| %load4 = load i64, ptr addrspace(1) %add.ptr8.3, align 8 |
| %add.3 = add i64 %load4, %add.2 |
| |
| %add.ptr8.4 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1024 |
| %load5 = load i64, ptr addrspace(1) %add.ptr8.4, align 8 |
| %add.4 = add i64 %load5, %add.3 |
| %add.ptr8.5 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1280 |
| %load6 = load i64, ptr addrspace(1) %add.ptr8.5, align 8 |
| %add.5 = add i64 %load6, %add.4 |
| |
| %add.ptr8.6 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1536 |
| %load7 = load i64, ptr addrspace(1) %add.ptr8.6, align 8 |
| %add.6 = add i64 %load7, %add.5 |
| %add.ptr8.7 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1792 |
| %load8 = load i64, ptr addrspace(1) %add.ptr8.7, align 8 |
| %add.7 = add i64 %load8, %add.6 |
| |
; The accumulated sum is written to the base pointer %add.ptr12, not to %addr1.
| store i64 %add.7, ptr addrspace(1) %add.ptr12, align 8 |
| ret void |
| } |
| |
| define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { |
| ; GFX8-LABEL: clmem_read: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX8-NEXT: s_mov_b32 s38, -1 |
| ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 |
| ; GFX8-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX8-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX8-NEXT: s_getpc_b64 s[0:1] |
| ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX8-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX8-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX8-NEXT: s_mov_b32 s32, 0 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 17, v0 |
| ; GFX8-NEXT: v_and_b32_e32 v12, 0xfe000000, v1 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, 3 |
| ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 |
| ; GFX8-NEXT: v_or_b32_e32 v0, v12, v0 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s35 |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0x5000 |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; GFX8-NEXT: v_mov_b32_e32 v10, 0 |
| ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX8-NEXT: v_mov_b32_e32 v11, 0 |
| ; GFX8-NEXT: s_movk_i32 s0, 0x7f |
| ; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader |
| ; GFX8-NEXT: ; =>This Loop Header: Depth=1 |
| ; GFX8-NEXT: ; Child Loop BB1_2 Depth 2 |
| ; GFX8-NEXT: v_mov_b32_e32 v3, v1 |
| ; GFX8-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX8-NEXT: s_mov_b32 s1, 0 |
| ; GFX8-NEXT: .LBB1_2: ; %for.body |
| ; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1 |
| ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffb000, v2 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc |
| ; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[4:5] |
| ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffb800, v2 |
| ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc |
| ; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[6:7] |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffc000, v2 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc |
| ; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[4:5] |
| ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffc800, v2 |
| ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc |
| ; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffd000, v2 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc |
| ; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0xffffd800, v2 |
| ; GFX8-NEXT: v_addc_u32_e32 v20, vcc, -1, v3, vcc |
| ; GFX8-NEXT: v_add_u32_e32 v21, vcc, 0xffffe000, v2 |
| ; GFX8-NEXT: v_addc_u32_e32 v22, vcc, -1, v3, vcc |
| ; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5] |
| ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[19:20] |
| ; GFX8-NEXT: s_addk_i32 s1, 0x2000 |
| ; GFX8-NEXT: s_cmp_gt_u32 s1, 0x3fffff |
| ; GFX8-NEXT: s_waitcnt vmcnt(5) |
| ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v13, v10 |
| ; GFX8-NEXT: v_addc_u32_e32 v24, vcc, v14, v11, vcc |
| ; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffe800, v2 |
| ; GFX8-NEXT: v_addc_u32_e32 v11, vcc, -1, v3, vcc |
| ; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xfffff000, v2 |
| ; GFX8-NEXT: flat_load_dwordx2 v[19:20], v[21:22] |
| ; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11] |
| ; GFX8-NEXT: v_addc_u32_e32 v14, vcc, -1, v3, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(6) |
| ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v15, v23 |
| ; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v16, v24, vcc |
| ; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0xfffff800, v2 |
| ; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] |
| ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, -1, v3, vcc |
| ; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] |
| ; GFX8-NEXT: s_waitcnt vmcnt(7) |
| ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v17, v21 |
| ; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v18, v22, vcc |
| ; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[2:3] |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x10000, v2 |
| ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(7) |
| ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v21 |
| ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v22, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(6) |
| ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 |
| ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(5) |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(4) |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v19, v4 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v20, v5, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(3) |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v10, v4 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(2) |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v13, v4 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v14, v5, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(1) |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v15, v4 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v16, v5, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v17, v4 |
| ; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v18, v5, vcc |
| ; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 |
| ; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit |
| ; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1 |
| ; GFX8-NEXT: s_add_i32 s1, s0, -1 |
| ; GFX8-NEXT: s_cmp_eq_u32 s0, 0 |
| ; GFX8-NEXT: s_cbranch_scc1 .LBB1_5 |
| ; GFX8-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 |
| ; GFX8-NEXT: s_mov_b32 s0, s1 |
| ; GFX8-NEXT: s_branch .LBB1_1 |
| ; GFX8-NEXT: .LBB1_5: ; %while.end |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s35 |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v12 |
| ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[10:11] |
| ; GFX8-NEXT: s_endpgm |
| ; |
| ; GFX900-LABEL: clmem_read: |
| ; GFX900: ; %bb.0: ; %entry |
| ; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX900-NEXT: s_mov_b32 s38, -1 |
| ; GFX900-NEXT: s_mov_b32 s39, 0xe00000 |
| ; GFX900-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX900-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX900-NEXT: s_getpc_b64 s[0:1] |
| ; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX900-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX900-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| ; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX900-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX900-NEXT: s_mov_b32 s32, 0 |
| ; GFX900-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 |
| ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 17, v0 |
| ; GFX900-NEXT: v_and_b32_e32 v6, 0xfe000000, v0 |
| ; GFX900-NEXT: v_lshl_or_b32 v0, v1, 3, v6 |
| ; GFX900-NEXT: v_mov_b32_e32 v1, s35 |
| ; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0 |
| ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX900-NEXT: s_movk_i32 s0, 0x5000 |
| ; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 |
| ; GFX900-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX900-NEXT: v_mov_b32_e32 v5, 0 |
| ; GFX900-NEXT: s_movk_i32 s5, 0x7f |
| ; GFX900-NEXT: s_movk_i32 s2, 0xd000 |
| ; GFX900-NEXT: s_movk_i32 s3, 0xe000 |
| ; GFX900-NEXT: s_movk_i32 s4, 0xf000 |
| ; GFX900-NEXT: .LBB1_1: ; %for.cond.preheader |
| ; GFX900-NEXT: ; =>This Loop Header: Depth=1 |
| ; GFX900-NEXT: ; Child Loop BB1_2 Depth 2 |
| ; GFX900-NEXT: v_mov_b32_e32 v3, v1 |
| ; GFX900-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX900-NEXT: s_mov_b32 s6, 0 |
| ; GFX900-NEXT: .LBB1_2: ; %for.body |
| ; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1 |
| ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 |
| ; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, 0xffffb000, v2 |
| ; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v3, vcc |
| ; GFX900-NEXT: global_load_dwordx2 v[9:10], v[2:3], off offset:-4096 |
| ; GFX900-NEXT: global_load_dwordx2 v[11:12], v[2:3], off offset:-2048 |
| ; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, 0xffffc000, v2 |
| ; GFX900-NEXT: global_load_dwordx2 v[7:8], v[7:8], off |
| ; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v3, vcc |
| ; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048 |
| ; GFX900-NEXT: global_load_dwordx2 v[19:20], v[13:14], off |
| ; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, s2, v2 |
| ; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, -1, v3, vcc |
| ; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, s3, v2 |
| ; GFX900-NEXT: global_load_dwordx2 v[15:16], v[15:16], off offset:-2048 |
| ; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v3, vcc |
| ; GFX900-NEXT: s_addk_i32 s6, 0x2000 |
| ; GFX900-NEXT: s_cmp_gt_u32 s6, 0x3fffff |
| ; GFX900-NEXT: s_waitcnt vmcnt(3) |
| ; GFX900-NEXT: v_add_co_u32_e32 v21, vcc, v7, v4 |
| ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc |
| ; GFX900-NEXT: global_load_dwordx2 v[7:8], v[13:14], off offset:-4096 |
| ; GFX900-NEXT: s_waitcnt vmcnt(3) |
| ; GFX900-NEXT: v_add_co_u32_e64 v23, s[0:1], v17, v21 |
| ; GFX900-NEXT: v_addc_co_u32_e64 v24, s[0:1], v18, v5, s[0:1] |
| ; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048 |
| ; GFX900-NEXT: global_load_dwordx2 v[21:22], v[13:14], off |
| ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s4, v2 |
| ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc |
| ; GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048 |
| ; GFX900-NEXT: s_waitcnt vmcnt(5) |
| ; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, v19, v23 |
| ; GFX900-NEXT: global_load_dwordx2 v[13:14], v[2:3], off |
| ; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, v20, v24, vcc |
| ; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2 |
| ; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
| ; GFX900-NEXT: s_waitcnt vmcnt(5) |
| ; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, v15, v19 |
| ; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, v16, v20, vcc |
| ; GFX900-NEXT: s_waitcnt vmcnt(4) |
| ; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v7, v15 |
| ; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v16, vcc |
| ; GFX900-NEXT: s_waitcnt vmcnt(3) |
| ; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v17, v7 |
| ; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v18, v8, vcc |
| ; GFX900-NEXT: s_waitcnt vmcnt(2) |
| ; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v21, v7 |
| ; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v22, v8, vcc |
| ; GFX900-NEXT: s_waitcnt vmcnt(1) |
| ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 |
| ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v8, vcc |
| ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v9, v4 |
| ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v10, v5, vcc |
| ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v11, v4 |
| ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v12, v5, vcc |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) |
| ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v13, v4 |
| ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v14, v5, vcc |
| ; GFX900-NEXT: s_cbranch_scc0 .LBB1_2 |
| ; GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit |
| ; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1 |
| ; GFX900-NEXT: s_add_i32 s0, s5, -1 |
| ; GFX900-NEXT: s_cmp_eq_u32 s5, 0 |
| ; GFX900-NEXT: s_cbranch_scc1 .LBB1_5 |
| ; GFX900-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 |
| ; GFX900-NEXT: s_mov_b32 s5, s0 |
| ; GFX900-NEXT: s_branch .LBB1_1 |
| ; GFX900-NEXT: .LBB1_5: ; %while.end |
| ; GFX900-NEXT: v_mov_b32_e32 v1, s35 |
| ; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s34, v6 |
| ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX900-NEXT: global_store_dwordx2 v[0:1], v[4:5], off |
| ; GFX900-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: clmem_read: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX10-NEXT: s_mov_b32 s38, -1 |
| ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 |
| ; GFX10-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX10-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX10-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX10-NEXT: s_mov_b32 s32, 0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 17, v0 |
| ; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 0 |
| ; GFX10-NEXT: s_movk_i32 s1, 0x7f |
| ; GFX10-NEXT: v_and_b32_e32 v6, 0xfe000000, v1 |
| ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 3, v6 |
| ; GFX10-NEXT: v_add_co_u32 v0, s0, v0, s34 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s35, s0 |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: .LBB1_1: ; %for.cond.preheader |
| ; GFX10-NEXT: ; =>This Loop Header: Depth=1 |
| ; GFX10-NEXT: ; Child Loop BB1_2 Depth 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, v1 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, v0 |
| ; GFX10-NEXT: s_mov_b32 s2, 0 |
| ; GFX10-NEXT: .LBB1_2: ; %for.body |
| ; GFX10-NEXT: ; Parent Loop BB1_1 Depth=1 |
| ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 |
| ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v4, 0xffffb800 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v5, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v4, 0xffffc800 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, -1, v5, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v4, 0xffffd800 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, -1, v5, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v17, vcc_lo, v4, 0xffffe800 |
| ; GFX10-NEXT: s_clause 0x2 |
| ; GFX10-NEXT: global_load_dwordx2 v[11:12], v[7:8], off offset:-2048 |
| ; GFX10-NEXT: global_load_dwordx2 v[15:16], v[9:10], off offset:-2048 |
| ; GFX10-NEXT: global_load_dwordx2 v[19:20], v[13:14], off offset:-2048 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, -1, v5, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v21, vcc_lo, 0xfffff000, v4 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, -1, v5, vcc_lo |
| ; GFX10-NEXT: s_clause 0x7 |
| ; GFX10-NEXT: global_load_dwordx2 v[23:24], v[17:18], off offset:-2048 |
| ; GFX10-NEXT: global_load_dwordx2 v[7:8], v[7:8], off |
| ; GFX10-NEXT: global_load_dwordx2 v[9:10], v[9:10], off |
| ; GFX10-NEXT: global_load_dwordx2 v[13:14], v[13:14], off |
| ; GFX10-NEXT: global_load_dwordx2 v[25:26], v[17:18], off |
| ; GFX10-NEXT: global_load_dwordx2 v[27:28], v[21:22], off |
| ; GFX10-NEXT: global_load_dwordx2 v[29:30], v[4:5], off offset:-2048 |
| ; GFX10-NEXT: global_load_dwordx2 v[31:32], v[4:5], off |
| ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x10000, v4 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo |
| ; GFX10-NEXT: s_addk_i32 s2, 0x2000 |
| ; GFX10-NEXT: s_cmp_gt_u32 s2, 0x3fffff |
| ; GFX10-NEXT: s_waitcnt vmcnt(10) |
| ; GFX10-NEXT: v_add_co_u32 v2, s0, v11, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v12, v3, s0 |
| ; GFX10-NEXT: s_waitcnt vmcnt(6) |
| ; GFX10-NEXT: v_add_co_u32 v2, s0, v7, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v8, v3, s0 |
| ; GFX10-NEXT: v_add_co_u32 v2, s0, v15, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v16, v3, s0 |
| ; GFX10-NEXT: s_waitcnt vmcnt(5) |
| ; GFX10-NEXT: v_add_co_u32 v2, s0, v9, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v10, v3, s0 |
| ; GFX10-NEXT: v_add_co_u32 v2, s0, v19, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v20, v3, s0 |
| ; GFX10-NEXT: s_waitcnt vmcnt(4) |
| ; GFX10-NEXT: v_add_co_u32 v2, s0, v13, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v14, v3, s0 |
| ; GFX10-NEXT: v_add_co_u32 v2, s0, v23, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v24, v3, s0 |
| ; GFX10-NEXT: s_waitcnt vmcnt(3) |
| ; GFX10-NEXT: v_add_co_u32 v2, s0, v25, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v26, v3, s0 |
| ; GFX10-NEXT: s_waitcnt vmcnt(2) |
| ; GFX10-NEXT: v_add_co_u32 v2, s0, v27, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v28, v3, s0 |
| ; GFX10-NEXT: s_waitcnt vmcnt(1) |
| ; GFX10-NEXT: v_add_co_u32 v2, s0, v29, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v30, v3, s0 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v31, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v32, v3, vcc_lo |
| ; GFX10-NEXT: s_cbranch_scc0 .LBB1_2 |
| ; GFX10-NEXT: ; %bb.3: ; %while.cond.loopexit |
| ; GFX10-NEXT: ; in Loop: Header=BB1_1 Depth=1 |
| ; GFX10-NEXT: s_add_i32 s0, s1, -1 |
| ; GFX10-NEXT: s_cmp_eq_u32 s1, 0 |
| ; GFX10-NEXT: s_cbranch_scc1 .LBB1_5 |
| ; GFX10-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 |
| ; GFX10-NEXT: s_mov_b32 s1, s0 |
| ; GFX10-NEXT: s_branch .LBB1_1 |
| ; GFX10-NEXT: .LBB1_5: ; %while.end |
| ; GFX10-NEXT: v_add_co_u32 v0, s0, s34, v6 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s35, 0, s0 |
| ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX90A-LABEL: clmem_read: |
| ; GFX90A: ; %bb.0: ; %entry |
| ; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX90A-NEXT: s_mov_b32 s38, -1 |
| ; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 |
| ; GFX90A-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX90A-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX90A-NEXT: s_getpc_b64 s[0:1] |
| ; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX90A-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| ; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX90A-NEXT: s_mov_b32 s32, 0 |
| ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX90A-NEXT: v_and_b32_e32 v1, 0xff, v0 |
| ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 17, v0 |
| ; GFX90A-NEXT: v_and_b32_e32 v0, 0xfe000000, v0 |
| ; GFX90A-NEXT: v_lshl_or_b32 v1, v1, 3, v0 |
| ; GFX90A-NEXT: v_mov_b32_e32 v2, s35 |
| ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, s34, v1 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v2, vcc |
| ; GFX90A-NEXT: s_movk_i32 s0, 0x5000 |
| ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v1 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
| ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], 0, 0 |
| ; GFX90A-NEXT: s_movk_i32 s3, 0x7f |
| ; GFX90A-NEXT: s_movk_i32 s0, 0xd000 |
| ; GFX90A-NEXT: s_movk_i32 s1, 0xe000 |
| ; GFX90A-NEXT: s_movk_i32 s2, 0xf000 |
| ; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader |
| ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 |
| ; GFX90A-NEXT: ; Child Loop BB1_2 Depth 2 |
| ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[0,1] |
| ; GFX90A-NEXT: s_mov_b32 s4, 0 |
| ; GFX90A-NEXT: .LBB1_2: ; %for.body |
| ; GFX90A-NEXT: ; Parent Loop BB1_1 Depth=1 |
| ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 |
| ; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffb000, v6 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc |
| ; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 |
| ; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048 |
| ; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6 |
| ; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc |
| ; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 |
| ; GFX90A-NEXT: v_add_co_u32_e32 v16, vcc, s0, v6 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v7, vcc |
| ; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048 |
| ; GFX90A-NEXT: v_add_co_u32_e32 v20, vcc, s1, v6 |
| ; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[14:15], off |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v7, vcc |
| ; GFX90A-NEXT: global_load_dwordx2 v[24:25], v[20:21], off offset:-4096 |
| ; GFX90A-NEXT: global_load_dwordx2 v[26:27], v[20:21], off offset:-2048 |
| ; GFX90A-NEXT: global_load_dwordx2 v[28:29], v[20:21], off |
| ; GFX90A-NEXT: v_add_co_u32_e32 v22, vcc, s2, v6 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v7, vcc |
| ; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[22:23], off offset:-2048 |
| ; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off |
| ; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc |
| ; GFX90A-NEXT: s_addk_i32 s4, 0x2000 |
| ; GFX90A-NEXT: s_cmp_gt_u32 s4, 0x3fffff |
| ; GFX90A-NEXT: s_waitcnt vmcnt(8) |
| ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v12, v4 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc |
| ; GFX90A-NEXT: s_waitcnt vmcnt(7) |
| ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v18, v1 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v19, v4, vcc |
| ; GFX90A-NEXT: s_waitcnt vmcnt(5) |
| ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v14, v1 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v15, v4, vcc |
| ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v16, v1 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v17, v4, vcc |
| ; GFX90A-NEXT: s_waitcnt vmcnt(4) |
| ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v24, v1 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v25, v4, vcc |
| ; GFX90A-NEXT: s_waitcnt vmcnt(3) |
| ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v26, v1 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v27, v4, vcc |
| ; GFX90A-NEXT: s_waitcnt vmcnt(2) |
| ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v28, v1 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v29, v4, vcc |
| ; GFX90A-NEXT: s_waitcnt vmcnt(1) |
| ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v20, v1 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v21, v4, vcc |
| ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v8, v1 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc |
| ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v10, v1 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v4, vcc |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) |
| ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v1 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc |
| ; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2 |
| ; GFX90A-NEXT: ; %bb.3: ; %while.cond.loopexit |
| ; GFX90A-NEXT: ; in Loop: Header=BB1_1 Depth=1 |
| ; GFX90A-NEXT: s_add_i32 s4, s3, -1 |
| ; GFX90A-NEXT: s_cmp_eq_u32 s3, 0 |
| ; GFX90A-NEXT: s_cbranch_scc1 .LBB1_5 |
| ; GFX90A-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 |
| ; GFX90A-NEXT: s_mov_b32 s3, s4 |
| ; GFX90A-NEXT: s_branch .LBB1_1 |
| ; GFX90A-NEXT: .LBB1_5: ; %while.end |
| ; GFX90A-NEXT: v_mov_b32_e32 v1, s35 |
| ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0 |
| ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[4:5], off |
| ; GFX90A-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: clmem_read: |
| ; GFX11: ; %bb.0: ; %entry |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 |
| ; GFX11-NEXT: s_mov_b32 s32, 0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 17, v0 |
| ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0xff, v0 |
| ; GFX11-NEXT: s_movk_i32 s1, 0x7f |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_and_b32_e32 v6, 0xfe000000, v1 |
| ; GFX11-NEXT: v_lshl_or_b32 v0, v0, 3, v6 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_u32 v0, s0, v0, s34 |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s35, s0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo |
| ; GFX11-NEXT: .LBB1_1: ; %for.cond.preheader |
| ; GFX11-NEXT: ; =>This Loop Header: Depth=1 |
| ; GFX11-NEXT: ; Child Loop BB1_2 Depth 2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 |
| ; GFX11-NEXT: s_mov_b32 s2, 0 |
| ; GFX11-NEXT: .LBB1_2: ; %for.body |
| ; GFX11-NEXT: ; Parent Loop BB1_1 Depth=1 |
| ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v7, vcc_lo, v4, 0xffffc000 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v5, vcc_lo |
| ; GFX11-NEXT: v_add_co_u32 v9, vcc_lo, 0xffffc000, v4 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, -1, v5, vcc_lo |
| ; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, 0xffffd000, v4 |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: global_load_b64 v[13:14], v[7:8], off offset:-4096 |
| ; GFX11-NEXT: global_load_b64 v[9:10], v[9:10], off offset:-2048 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, -1, v5, vcc_lo |
| ; GFX11-NEXT: v_add_co_u32 v15, vcc_lo, v4, 0xffffe000 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, -1, v5, vcc_lo |
| ; GFX11-NEXT: global_load_b64 v[11:12], v[11:12], off offset:-2048 |
| ; GFX11-NEXT: v_add_co_u32 v17, vcc_lo, 0xffffe000, v4 |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: global_load_b64 v[19:20], v[15:16], off offset:-4096 |
| ; GFX11-NEXT: global_load_b64 v[7:8], v[7:8], off |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, -1, v5, vcc_lo |
| ; GFX11-NEXT: v_add_co_u32 v21, vcc_lo, 0xfffff000, v4 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, -1, v5, vcc_lo |
| ; GFX11-NEXT: s_clause 0x5 |
| ; GFX11-NEXT: global_load_b64 v[17:18], v[17:18], off offset:-2048 |
| ; GFX11-NEXT: global_load_b64 v[15:16], v[15:16], off |
| ; GFX11-NEXT: global_load_b64 v[21:22], v[21:22], off offset:-2048 |
| ; GFX11-NEXT: global_load_b64 v[23:24], v[4:5], off offset:-4096 |
| ; GFX11-NEXT: global_load_b64 v[25:26], v[4:5], off offset:-2048 |
| ; GFX11-NEXT: global_load_b64 v[27:28], v[4:5], off |
| ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x10000, v4 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo |
| ; GFX11-NEXT: s_addk_i32 s2, 0x2000 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: s_cmp_gt_u32 s2, 0x3fffff |
| ; GFX11-NEXT: s_waitcnt vmcnt(10) |
| ; GFX11-NEXT: v_add_co_u32 v2, s0, v13, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v14, v3, s0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(9) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_u32 v2, s0, v9, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v10, v3, s0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(6) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_u32 v2, s0, v7, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v8, v3, s0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_u32 v2, s0, v11, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v12, v3, s0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_u32 v2, s0, v19, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v20, v3, s0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(5) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_u32 v2, s0, v17, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v18, v3, s0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(4) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_u32 v2, s0, v15, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v16, v3, s0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(3) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_u32 v2, s0, v21, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v22, v3, s0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(2) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_u32 v2, s0, v23, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v24, v3, s0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(1) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_u32 v2, s0, v25, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v26, v3, s0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v27, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v28, v3, vcc_lo |
| ; GFX11-NEXT: s_cbranch_scc0 .LBB1_2 |
| ; GFX11-NEXT: ; %bb.3: ; %while.cond.loopexit |
| ; GFX11-NEXT: ; in Loop: Header=BB1_1 Depth=1 |
| ; GFX11-NEXT: s_add_i32 s0, s1, -1 |
| ; GFX11-NEXT: s_cmp_eq_u32 s1, 0 |
| ; GFX11-NEXT: s_cbranch_scc1 .LBB1_5 |
| ; GFX11-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 |
| ; GFX11-NEXT: s_mov_b32 s1, s0 |
| ; GFX11-NEXT: s_branch .LBB1_1 |
| ; GFX11-NEXT: .LBB1_5: ; %while.end |
| ; GFX11-NEXT: v_add_co_u32 v0, s0, s34, v6 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0 |
| ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off |
| ; GFX11-NEXT: s_endpgm |
| entry: |
| %call = tail call i64 @_Z13get_global_idj(i32 0) |
| %conv = and i64 %call, 255 |
| %a0 = shl i64 %call, 17 |
| %idx.ext11 = and i64 %a0, 4261412864 |
| %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11 |
| %add.ptr6 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv |
| br label %for.cond.preheader |
| |
| while.cond.loopexit: ; preds = %for.body |
| %dec = add nsw i32 %dec31, -1 |
| %tobool = icmp eq i32 %dec31, 0 |
| br i1 %tobool, label %while.end, label %for.cond.preheader |
| |
| for.cond.preheader: ; preds = %entry, %while.cond.loopexit |
| %dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ] |
| %sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ] |
| br label %for.body |
| |
| for.body: ; preds = %for.body, %for.cond.preheader |
| %block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ] |
| %sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ] |
| %conv3 = zext i32 %block.029 to i64 |
| %add.ptr8 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3 |
| %load1 = load i64, ptr addrspace(1) %add.ptr8, align 8 |
| %add = add i64 %load1, %sum.128 |
| |
| %add9 = or disjoint i32 %block.029, 256 |
| %conv3.1 = zext i32 %add9 to i64 |
| %add.ptr8.1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.1 |
| %load2 = load i64, ptr addrspace(1) %add.ptr8.1, align 8 |
| %add.1 = add i64 %load2, %add |
| |
| %add9.1 = or disjoint i32 %block.029, 512 |
| %conv3.2 = zext i32 %add9.1 to i64 |
| %add.ptr8.2 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.2 |
| %l3 = load i64, ptr addrspace(1) %add.ptr8.2, align 8 |
| %add.2 = add i64 %l3, %add.1 |
| |
| %add9.2 = or disjoint i32 %block.029, 768 |
| %conv3.3 = zext i32 %add9.2 to i64 |
| %add.ptr8.3 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.3 |
| %l4 = load i64, ptr addrspace(1) %add.ptr8.3, align 8 |
| %add.3 = add i64 %l4, %add.2 |
| |
| %add9.3 = or disjoint i32 %block.029, 1024 |
| %conv3.4 = zext i32 %add9.3 to i64 |
| %add.ptr8.4 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.4 |
| %l5 = load i64, ptr addrspace(1) %add.ptr8.4, align 8 |
| %add.4 = add i64 %l5, %add.3 |
| |
| %add9.4 = or disjoint i32 %block.029, 1280 |
| %conv3.5 = zext i32 %add9.4 to i64 |
| %add.ptr8.5 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.5 |
| %l6 = load i64, ptr addrspace(1) %add.ptr8.5, align 8 |
| %add.5 = add i64 %l6, %add.4 |
| |
| %add9.5 = or disjoint i32 %block.029, 1536 |
| %conv3.6 = zext i32 %add9.5 to i64 |
| %add.ptr8.6 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.6 |
| %load7 = load i64, ptr addrspace(1) %add.ptr8.6, align 8 |
| %add.6 = add i64 %load7, %add.5 |
| |
| %add9.6 = or disjoint i32 %block.029, 1792 |
| %conv3.7 = zext i32 %add9.6 to i64 |
| %add.ptr8.7 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.7 |
| %load8 = load i64, ptr addrspace(1) %add.ptr8.7, align 8 |
| %add.7 = add i64 %load8, %add.6 |
| |
| %add9.7 = or disjoint i32 %block.029, 2048 |
| %conv3.8 = zext i32 %add9.7 to i64 |
| %add.ptr8.8 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.8 |
| %load9 = load i64, ptr addrspace(1) %add.ptr8.8, align 8 |
| %add.8 = add i64 %load9, %add.7 |
| |
| %add9.8 = or disjoint i32 %block.029, 2304 |
| %conv3.9 = zext i32 %add9.8 to i64 |
| %add.ptr8.9 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.9 |
| %load10 = load i64, ptr addrspace(1) %add.ptr8.9, align 8 |
| %add.9 = add i64 %load10, %add.8 |
| |
| %add9.9 = or disjoint i32 %block.029, 2560 |
| %conv3.10 = zext i32 %add9.9 to i64 |
| %add.ptr8.10 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.10 |
| %load11 = load i64, ptr addrspace(1) %add.ptr8.10, align 8 |
| %add.10 = add i64 %load11, %add.9 |
| |
| %add9.31 = add nuw nsw i32 %block.029, 8192 |
| %cmp.31 = icmp ult i32 %add9.31, 4194304 |
| br i1 %cmp.31, label %for.body, label %while.cond.loopexit |
| |
| while.end: ; preds = %while.cond.loopexit |
| store i64 %add.10, ptr addrspace(1) %add.ptr12, align 8 |
| ret void |
| } |
| |
| ; Using a 32-bit address. |
| define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { |
| ; GFX8-LABEL: Address32: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX8-NEXT: s_mov_b32 s38, -1 |
| ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 |
| ; GFX8-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX8-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX8-NEXT: s_getpc_b64 s[0:1] |
| ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX8-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX8-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX8-NEXT: s_mov_b32 s32, 0 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 |
| ; GFX8-NEXT: v_mov_b32_e32 v2, s35 |
| ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 |
| ; GFX8-NEXT: v_mov_b32_e32 v3, 2 |
| ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc |
| ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 |
| ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0x400 |
| ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0x800 |
| ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0xc00 |
| ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0x1000 |
| ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v4, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0x1400 |
| ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v4, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0x1800 |
| ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0x1c00 |
| ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v4, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0x2000 |
| ; GFX8-NEXT: flat_load_dword v0, v[3:4] |
| ; GFX8-NEXT: flat_load_dword v19, v[5:6] |
| ; GFX8-NEXT: flat_load_dword v7, v[7:8] |
| ; GFX8-NEXT: flat_load_dword v8, v[9:10] |
| ; GFX8-NEXT: flat_load_dword v9, v[11:12] |
| ; GFX8-NEXT: flat_load_dword v10, v[13:14] |
| ; GFX8-NEXT: flat_load_dword v11, v[15:16] |
| ; GFX8-NEXT: flat_load_dword v12, v[17:18] |
| ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc |
| ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x2400, v3 |
| ; GFX8-NEXT: flat_load_dword v5, v[5:6] |
| ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc |
| ; GFX8-NEXT: flat_load_dword v3, v[3:4] |
| ; GFX8-NEXT: s_waitcnt vmcnt(8) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v19, v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(7) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(6) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v8, v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(5) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(4) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v10, v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(3) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v11, v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(2) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v12, v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(1) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 |
| ; GFX8-NEXT: flat_store_dword v[1:2], v0 |
| ; GFX8-NEXT: s_endpgm |
| ; |
| ; GFX9-LABEL: Address32: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX9-NEXT: s_mov_b32 s38, -1 |
| ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 |
| ; GFX9-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX9-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX9-NEXT: s_getpc_b64 s[0:1] |
| ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX9-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: s_mov_b32 s32, 0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff8000, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s35 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v4 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, 2 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: s_movk_i32 s0, 0x1000 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc |
| ; GFX9-NEXT: global_load_dword v5, v[0:1], off |
| ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:1024 |
| ; GFX9-NEXT: global_load_dword v7, v[0:1], off offset:2048 |
| ; GFX9-NEXT: global_load_dword v8, v[0:1], off offset:3072 |
| ; GFX9-NEXT: global_load_dword v9, v[2:3], off |
| ; GFX9-NEXT: global_load_dword v10, v[2:3], off offset:1024 |
| ; GFX9-NEXT: global_load_dword v11, v[2:3], off offset:2048 |
| ; GFX9-NEXT: global_load_dword v12, v[2:3], off offset:3072 |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: global_load_dword v2, v[0:1], off |
| ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:1024 |
| ; GFX9-NEXT: s_waitcnt vmcnt(8) |
| ; GFX9-NEXT: v_add_u32_e32 v0, v6, v5 |
| ; GFX9-NEXT: s_waitcnt vmcnt(6) |
| ; GFX9-NEXT: v_add3_u32 v0, v7, v0, v8 |
| ; GFX9-NEXT: s_waitcnt vmcnt(4) |
| ; GFX9-NEXT: v_add3_u32 v0, v9, v0, v10 |
| ; GFX9-NEXT: s_waitcnt vmcnt(2) |
| ; GFX9-NEXT: v_add3_u32 v0, v11, v0, v12 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_add3_u32 v0, v2, v0, v3 |
| ; GFX9-NEXT: global_store_dword v4, v0, s[34:35] |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: Address32: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX10-NEXT: s_mov_b32 s38, -1 |
| ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 |
| ; GFX10-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX10-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX10-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX10-NEXT: s_mov_b32 s32, 0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 2 |
| ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff8000, v1 |
| ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 |
| ; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v8 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x800, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x1000 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, 0x1000, v0 |
| ; GFX10-NEXT: s_clause 0x4 |
| ; GFX10-NEXT: global_load_dword v9, v[0:1], off |
| ; GFX10-NEXT: global_load_dword v10, v[0:1], off offset:1024 |
| ; GFX10-NEXT: global_load_dword v11, v[2:3], off offset:1024 |
| ; GFX10-NEXT: global_load_dword v12, v[4:5], off offset:-2048 |
| ; GFX10-NEXT: global_load_dword v13, v[4:5], off |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x1800, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x2000 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: global_load_dword v14, v[6:7], off offset:1024 |
| ; GFX10-NEXT: global_load_dword v15, v[2:3], off offset:1024 |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: s_clause 0x2 |
| ; GFX10-NEXT: global_load_dword v2, v[4:5], off offset:-2048 |
| ; GFX10-NEXT: global_load_dword v3, v[4:5], off |
| ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:1024 |
| ; GFX10-NEXT: s_waitcnt vmcnt(8) |
| ; GFX10-NEXT: v_add_nc_u32_e32 v0, v10, v9 |
| ; GFX10-NEXT: s_waitcnt vmcnt(6) |
| ; GFX10-NEXT: v_add3_u32 v0, v12, v0, v11 |
| ; GFX10-NEXT: s_waitcnt vmcnt(4) |
| ; GFX10-NEXT: v_add3_u32 v0, v13, v0, v14 |
| ; GFX10-NEXT: s_waitcnt vmcnt(2) |
| ; GFX10-NEXT: v_add3_u32 v0, v2, v0, v15 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_add3_u32 v0, v3, v0, v6 |
| ; GFX10-NEXT: global_store_dword v8, v0, s[34:35] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: Address32: |
| ; GFX11: ; %bb.0: ; %entry |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 |
| ; GFX11-NEXT: s_mov_b32 s32, 0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff8000, v1 |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v6 |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: global_load_b32 v7, v[0:1], off |
| ; GFX11-NEXT: global_load_b32 v8, v[0:1], off offset:1024 |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x1000, v0 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo |
| ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x2000 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo |
| ; GFX11-NEXT: s_clause 0x5 |
| ; GFX11-NEXT: global_load_b32 v9, v[0:1], off offset:2048 |
| ; GFX11-NEXT: global_load_b32 v10, v[0:1], off offset:3072 |
| ; GFX11-NEXT: global_load_b32 v11, v[4:5], off offset:-4096 |
| ; GFX11-NEXT: global_load_b32 v12, v[2:3], off offset:1024 |
| ; GFX11-NEXT: global_load_b32 v13, v[2:3], off offset:2048 |
| ; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:3072 |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: global_load_b32 v3, v[4:5], off |
| ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:1024 |
| ; GFX11-NEXT: s_waitcnt vmcnt(8) |
| ; GFX11-NEXT: v_add_nc_u32_e32 v1, v8, v7 |
| ; GFX11-NEXT: s_waitcnt vmcnt(6) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_add3_u32 v1, v9, v1, v10 |
| ; GFX11-NEXT: s_waitcnt vmcnt(4) |
| ; GFX11-NEXT: v_add3_u32 v1, v11, v1, v12 |
| ; GFX11-NEXT: s_waitcnt vmcnt(2) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_add3_u32 v1, v13, v1, v2 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_add3_u32 v0, v3, v1, v0 |
| ; GFX11-NEXT: global_store_b32 v6, v0, s[34:35] |
| ; GFX11-NEXT: s_endpgm |
| entry: |
| %call = tail call i64 @_Z13get_global_idj(i32 0) |
| %conv = and i64 %call, 255 |
| %id = shl i64 %call, 7 |
| %idx.ext11 = and i64 %id, 4294934528 |
| %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11 |
| |
| %add.ptr6 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr12, i64 %conv |
| %load1 = load i32, ptr addrspace(1) %add.ptr6, align 4 |
| |
| %add.ptr8.1 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 256 |
| %load2 = load i32, ptr addrspace(1) %add.ptr8.1, align 4 |
| %add.1 = add i32 %load2, %load1 |
| |
| %add.ptr8.2 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 512 |
| %load3 = load i32, ptr addrspace(1) %add.ptr8.2, align 4 |
| %add.2 = add i32 %load3, %add.1 |
| |
| %add.ptr8.3 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 768 |
| %load4 = load i32, ptr addrspace(1) %add.ptr8.3, align 4 |
| %add.3 = add i32 %load4, %add.2 |
| |
| %add.ptr8.4 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 1024 |
| %load5 = load i32, ptr addrspace(1) %add.ptr8.4, align 4 |
| %add.4 = add i32 %load5, %add.3 |
| |
| %add.ptr8.5 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 1280 |
| %load6 = load i32, ptr addrspace(1) %add.ptr8.5, align 4 |
| %add.5 = add i32 %load6, %add.4 |
| |
| %add.ptr8.6 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 1536 |
| %load7 = load i32, ptr addrspace(1) %add.ptr8.6, align 4 |
| %add.6 = add i32 %load7, %add.5 |
| |
| %add.ptr8.7 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 1792 |
| %load8 = load i32, ptr addrspace(1) %add.ptr8.7, align 4 |
| %add.7 = add i32 %load8, %add.6 |
| |
| %add.ptr8.8 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 2048 |
| %load9 = load i32, ptr addrspace(1) %add.ptr8.8, align 4 |
| %add.8 = add i32 %load9, %add.7 |
| |
| %add.ptr8.9 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 2304 |
| %load10 = load i32, ptr addrspace(1) %add.ptr8.9, align 4 |
| %add.9 = add i32 %load10, %add.8 |
| |
| store i32 %add.9, ptr addrspace(1) %add.ptr12, align 4 |
| ret void |
| } |
| |
; Offset64: sums four i64 loads taken at constant element offsets from one
; computed address and stores the result. The constant byte offsets from
; %addr1 (8 bytes per i64 element) are:
;   %addr2: 536870400 * 8 = 0xfffff000
;   %addr3: 536870656 * 8 = 0xfffff800
;   %addr4: 536870912 * 8 = 0x100000000
; The last offset does not fit in 32 bits; in the check lines below it shows
; up as an add of 1 to the high dword of the address register pair.
| define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { |
| ; GFX8-LABEL: Offset64: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX8-NEXT: s_mov_b32 s38, -1 |
| ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 |
| ; GFX8-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX8-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX8-NEXT: s_getpc_b64 s[0:1] |
| ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX8-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX8-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX8-NEXT: s_mov_b32 s32, 0 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 |
| ; GFX8-NEXT: v_mov_b32_e32 v2, s35 |
| ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 |
| ; GFX8-NEXT: v_mov_b32_e32 v3, 3 |
| ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc |
| ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 |
| ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0xf000 |
| ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0xf800 |
| ; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[3:4] |
| ; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] |
| ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc |
| ; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v4 |
| ; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] |
| ; GFX8-NEXT: s_waitcnt vmcnt(2) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v5 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v8, v6, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(1) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc |
| ; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4] |
| ; GFX8-NEXT: s_endpgm |
| ; |
| ; GFX9-LABEL: Offset64: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX9-NEXT: s_mov_b32 s38, -1 |
| ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 |
| ; GFX9-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX9-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX9-NEXT: s_getpc_b64 s[0:1] |
| ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX9-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: s_mov_b32 s32, 0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff8000, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s35 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v10 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, 3 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: s_movk_i32 s0, 0xf000 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc |
| ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off |
| ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off |
| ; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:2048 |
| ; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 |
| ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(2) |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(1) |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v9, v3, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc |
| ; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[34:35] |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: Offset64: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX10-NEXT: s_mov_b32 s38, -1 |
| ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 |
| ; GFX10-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX10-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX10-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX10-NEXT: s_mov_b32 s32, 0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff8000, v1 |
| ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 |
| ; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v12 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0xfffff800 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off |
| ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off offset:-2048 |
| ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1 |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off |
| ; GFX10-NEXT: global_load_dwordx2 v[10:11], v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(2) |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v4 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo |
| ; GFX10-NEXT: s_waitcnt vmcnt(1) |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v8, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v1, vcc_lo |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v10, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo |
| ; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[34:35] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: Offset64: |
| ; GFX11: ; %bb.0: ; %entry |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 |
| ; GFX11-NEXT: s_mov_b32 s32, 0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff8000, v1 |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v8 |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0xfffff000, v0 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo |
| ; GFX11-NEXT: s_clause 0x2 |
| ; GFX11-NEXT: global_load_b64 v[4:5], v[2:3], off |
| ; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off |
| ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:2048 |
| ; GFX11-NEXT: v_add_nc_u32_e32 v1, 1, v1 |
| ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off |
| ; GFX11-NEXT: s_waitcnt vmcnt(2) |
| ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo |
| ; GFX11-NEXT: s_waitcnt vmcnt(1) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo |
| ; GFX11-NEXT: global_store_b64 v8, v[0:1], s[34:35] |
| ; GFX11-NEXT: s_endpgm |
| entry: |
| %call = tail call i64 @_Z13get_global_idj(i32 0) |
; %conv keeps the low 8 bits of the returned id; it is the i64 element index.
| %conv = and i64 %call, 255 |
; %idx.ext11 is (id << 7) masked with 0xffff8000, used as a byte offset into
; %buffer.
| %a0 = shl i64 %call, 7 |
| %idx.ext11 = and i64 %a0, 4294934528 |
| %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11 |
| |
| %addr1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv |
| %load1 = load i64, ptr addrspace(1) %addr1, align 8 |
| |
; %addr1 + 0xfffff000 bytes.
| %addr2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 536870400 |
| %load2 = load i64, ptr addrspace(1) %addr2, align 8 |
| |
| %add1 = add i64 %load2, %load1 |
| |
; %addr1 + 0xfffff800 bytes.
| %addr3 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 536870656 |
| %load3 = load i64, ptr addrspace(1) %addr3, align 8 |
| |
| %add2 = add i64 %load3, %add1 |
| |
; %addr1 + 0x100000000 bytes (exactly 2^32).
| %addr4 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 536870912 |
| %load4 = load i64, ptr addrspace(1) %addr4, align 8 |
| %add4 = add i64 %load4, %add2 |
| |
; The sum is written to %add.ptr12 (the masked base), not to %addr1.
| store i64 %add4, ptr addrspace(1) %add.ptr12, align 8 |
| ret void |
| } |
| |
| ; TODO: Support load4 as anchor instruction. |
; p32Offset64: sums four i32 loads taken at constant element offsets from one
; computed address and stores the result. The constant byte offsets from
; %addr1 (4 bytes per i32 element) are:
;   %addr2: 536870400 * 4 = 0x7ffff800
;   %addr3: 536870656 * 4 = 0x7ffffc00
;   %addr4: 536870912 * 4 = 0x80000000
; In the check lines for the gfx9 and gfx11 runs, %load2 and %load3 share one
; base (address + 0x7ffff000) with immediate offsets 2048 and 3072, while
; %load4 gets its own 64-bit add of 0x80000000.
; NOTE(review): every load is marked align 8 although %addr1 is %add.ptr12
; plus 4 * %conv and %conv may be odd - confirm the alignment is intentional.
| define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { |
| ; GFX8-LABEL: p32Offset64: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX8-NEXT: s_mov_b32 s38, -1 |
| ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 |
| ; GFX8-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX8-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX8-NEXT: s_getpc_b64 s[0:1] |
| ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX8-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX8-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX8-NEXT: s_mov_b32 s32, 0 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 |
| ; GFX8-NEXT: v_mov_b32_e32 v2, s35 |
| ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 |
| ; GFX8-NEXT: v_mov_b32_e32 v3, 2 |
| ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc |
| ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 |
| ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc |
| ; GFX8-NEXT: s_mov_b32 s0, 0x7ffff800 |
| ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc |
| ; GFX8-NEXT: s_mov_b32 s0, 0x7ffffc00 |
| ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc |
| ; GFX8-NEXT: flat_load_dword v0, v[3:4] |
| ; GFX8-NEXT: flat_load_dword v5, v[5:6] |
| ; GFX8-NEXT: flat_load_dword v6, v[7:8] |
| ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc |
| ; GFX8-NEXT: flat_load_dword v3, v[3:4] |
| ; GFX8-NEXT: s_waitcnt vmcnt(2) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(1) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 |
| ; GFX8-NEXT: flat_store_dword v[1:2], v0 |
| ; GFX8-NEXT: s_endpgm |
| ; |
| ; GFX9-LABEL: p32Offset64: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX9-NEXT: s_mov_b32 s38, -1 |
| ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 |
| ; GFX9-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX9-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX9-NEXT: s_getpc_b64 s[0:1] |
| ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX9-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: s_mov_b32 s32, 0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff8000, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s35 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v6 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, 2 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: s_mov_b32 s0, 0x7ffff000 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc |
| ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0x80000000, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc |
| ; GFX9-NEXT: global_load_dword v7, v[0:1], off |
| ; GFX9-NEXT: global_load_dword v8, v[2:3], off offset:2048 |
| ; GFX9-NEXT: global_load_dword v9, v[2:3], off offset:3072 |
| ; GFX9-NEXT: global_load_dword v10, v[4:5], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(2) |
| ; GFX9-NEXT: v_add_u32_e32 v0, v8, v7 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_add3_u32 v0, v9, v0, v10 |
| ; GFX9-NEXT: global_store_dword v6, v0, s[34:35] |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: p32Offset64: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX10-NEXT: s_mov_b32 s38, -1 |
| ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 |
| ; GFX10-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX10-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX10-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX10-NEXT: s_mov_b32 s32, 0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 2 |
| ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff8000, v1 |
| ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 |
| ; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v4 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x80000000 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: global_load_dword v5, v[0:1], off |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ffff800, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: s_clause 0x2 |
| ; GFX10-NEXT: global_load_dword v6, v[2:3], off offset:-2048 |
| ; GFX10-NEXT: global_load_dword v7, v[2:3], off |
| ; GFX10-NEXT: global_load_dword v8, v[0:1], off offset:1024 |
| ; GFX10-NEXT: s_waitcnt vmcnt(2) |
| ; GFX10-NEXT: v_add_nc_u32_e32 v0, v6, v5 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_add3_u32 v0, v8, v0, v7 |
| ; GFX10-NEXT: global_store_dword v4, v0, s[34:35] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: p32Offset64: |
| ; GFX11: ; %bb.0: ; %entry |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 |
| ; GFX11-NEXT: s_mov_b32 s32, 0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff8000, v1 |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v6 |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x7ffff000, v0 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo |
| ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x80000000, v0 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo |
| ; GFX11-NEXT: s_clause 0x3 |
| ; GFX11-NEXT: global_load_b32 v0, v[0:1], off |
| ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:2048 |
| ; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:3072 |
| ; GFX11-NEXT: global_load_b32 v3, v[4:5], off |
| ; GFX11-NEXT: s_waitcnt vmcnt(2) |
| ; GFX11-NEXT: v_add_nc_u32_e32 v0, v1, v0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add3_u32 v0, v2, v0, v3 |
| ; GFX11-NEXT: global_store_b32 v6, v0, s[34:35] |
| ; GFX11-NEXT: s_endpgm |
| entry: |
| %call = tail call i64 @_Z13get_global_idj(i32 0) |
; %conv keeps the low 8 bits of the returned id; it is the i32 element index.
| %conv = and i64 %call, 255 |
; %idx.ext11 is (id << 7) masked with 0xffff8000, used as a byte offset into
; %buffer.
| %a0 = shl i64 %call, 7 |
| %idx.ext11 = and i64 %a0, 4294934528 |
| %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11 |
| |
| %addr1 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr12, i64 %conv |
| %load1 = load i32, ptr addrspace(1) %addr1, align 8 |
| |
; %addr1 + 0x7ffff800 bytes.
| %addr2 = getelementptr inbounds i32, ptr addrspace(1) %addr1, i64 536870400 |
| %load2 = load i32, ptr addrspace(1) %addr2, align 8 |
| |
| %add1 = add i32 %load2, %load1 |
| |
; %addr1 + 0x7ffffc00 bytes.
| %addr3 = getelementptr inbounds i32, ptr addrspace(1) %addr1, i64 536870656 |
| %load3 = load i32, ptr addrspace(1) %addr3, align 8 |
| |
| %add2 = add i32 %load3, %add1 |
| |
; %addr1 + 0x80000000 bytes (exactly 2^31).
| %addr4 = getelementptr inbounds i32, ptr addrspace(1) %addr1, i64 536870912 |
| %load4 = load i32, ptr addrspace(1) %addr4, align 8 |
| %add4 = add i32 %load4, %add2 |
| |
; The sum is written to %add.ptr12 (the masked base), not to %addr1.
| store i32 %add4, ptr addrspace(1) %add.ptr12, align 8 |
| ret void |
| } |
| |
| define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, |
| ; GFX8-LABEL: DiffBase: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 |
| ; GFX8-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 |
| ; GFX8-NEXT: s_mov_b32 s42, -1 |
| ; GFX8-NEXT: s_mov_b32 s43, 0xe80000 |
| ; GFX8-NEXT: s_add_u32 s40, s40, s11 |
| ; GFX8-NEXT: s_addc_u32 s41, s41, 0 |
| ; GFX8-NEXT: s_getpc_b64 s[0:1] |
| ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| ; GFX8-NEXT: s_mov_b64 s[0:1], s[40:41] |
| ; GFX8-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX8-NEXT: s_mov_b64 s[2:3], s[42:43] |
| ; GFX8-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX8-NEXT: s_mov_b32 s32, 0 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 |
| ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff8000, v0 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s37 |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s36, v2 |
| ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX8-NEXT: v_mov_b32_e32 v3, s39 |
| ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s38, v2 |
| ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x1000, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x1800, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc |
| ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x2000, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc |
| ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x2800, v12 |
| ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] |
| ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[4:5] |
| ; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] |
| ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc |
| ; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x3000, v12 |
| ; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v13, vcc |
| ; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9] |
| ; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11] |
| ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0x3800, v12 |
| ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc |
| ; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13] |
| ; GFX8-NEXT: s_waitcnt vmcnt(4) |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 |
| ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(3) |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 |
| ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v7, v3, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(1) |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v10, v8 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 |
| ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc |
| ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] |
| ; GFX8-NEXT: s_endpgm |
| ; |
| ; GFX9-LABEL: DiffBase: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 |
| ; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 |
| ; GFX9-NEXT: s_mov_b32 s42, -1 |
| ; GFX9-NEXT: s_mov_b32 s43, 0xe00000 |
| ; GFX9-NEXT: s_add_u32 s40, s40, s11 |
| ; GFX9-NEXT: s_addc_u32 s41, s41, 0 |
| ; GFX9-NEXT: s_getpc_b64 s[0:1] |
| ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 |
| ; GFX9-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| ; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] |
| ; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: s_mov_b32 s32, 0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 7, v0 |
| ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff8000, v0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s37 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s36, v16 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v0, vcc |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s39 |
| ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, s38, v16 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v0, vcc |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v2 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x2000, v2 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc |
| ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off |
| ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2048 |
| ; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v10 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v11, vcc |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x3000, v10 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v11, vcc |
| ; GFX9-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 |
| ; GFX9-NEXT: global_load_dwordx2 v[12:13], v[2:3], off |
| ; GFX9-NEXT: global_load_dwordx2 v[14:15], v[2:3], off offset:2048 |
| ; GFX9-NEXT: s_waitcnt vmcnt(4) |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v4 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v5, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(3) |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(1) |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v12, v10 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v13, v11, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v2 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v15, v3, vcc |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc |
| ; GFX9-NEXT: global_store_dwordx2 v16, v[0:1], s[36:37] |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: DiffBase: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 |
| ; GFX10-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 |
| ; GFX10-NEXT: s_mov_b32 s42, -1 |
| ; GFX10-NEXT: s_mov_b32 s43, 0x31c16000 |
| ; GFX10-NEXT: s_add_u32 s40, s40, s11 |
| ; GFX10-NEXT: s_addc_u32 s41, s41, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX10-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 |
| ; GFX10-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: s_mov_b64 s[0:1], s[40:41] |
| ; GFX10-NEXT: s_mov_b64 s[2:3], s[42:43] |
| ; GFX10-NEXT: s_mov_b32 s32, 0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 7, v0 |
| ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff8000, v0 |
| ; GFX10-NEXT: v_add_co_u32 v8, s0, s36, v16 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s0, s37, 0, s0 |
| ; GFX10-NEXT: v_add_co_u32 v12, s0, s38, v16 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v13, s0, s39, 0, s0 |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v8, 0x1800 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v9, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v12, 0x3000 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v13, vcc_lo |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 |
| ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v8 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v9, vcc_lo |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:-2048 |
| ; GFX10-NEXT: global_load_dwordx2 v[10:11], v[2:3], off |
| ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x3800, v12 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v13, vcc_lo |
| ; GFX10-NEXT: global_load_dwordx2 v[12:13], v[0:1], off |
| ; GFX10-NEXT: global_load_dwordx2 v[14:15], v[2:3], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(4) |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v4 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo |
| ; GFX10-NEXT: s_waitcnt vmcnt(2) |
| ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v10, v8 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v11, v9, vcc_lo |
| ; GFX10-NEXT: s_waitcnt vmcnt(1) |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v12, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v14, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo |
| ; GFX10-NEXT: global_store_dwordx2 v16, v[0:1], s[36:37] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: DiffBase: |
| ; GFX11: ; %bb.0: ; %entry |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: s_load_b128 s[36:39], s[4:5], 0x24 |
| ; GFX11-NEXT: s_mov_b32 s32, 0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 7, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff8000, v0 |
| ; GFX11-NEXT: v_add_co_u32 v2, s0, s36, v12 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s37, 0, s0 |
| ; GFX11-NEXT: v_add_co_u32 v8, s0, s38, v12 |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, s39, 0, s0 |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0x2000 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo |
| ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x2000, v8 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v9, vcc_lo |
| ; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, 0x3000, v8 |
| ; GFX11-NEXT: global_load_b64 v[6:7], v[2:3], off offset:-4096 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo |
| ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048 |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: global_load_b64 v[4:5], v[4:5], off offset:2048 |
| ; GFX11-NEXT: global_load_b64 v[10:11], v[8:9], off |
| ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off |
| ; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off offset:2048 |
| ; GFX11-NEXT: s_waitcnt vmcnt(4) |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v7, vcc_lo |
| ; GFX11-NEXT: s_waitcnt vmcnt(2) |
| ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v10, v4 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v11, v5, vcc_lo |
| ; GFX11-NEXT: s_waitcnt vmcnt(1) |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v4 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo |
| ; GFX11-NEXT: global_store_b64 v12, v[0:1], s[36:37] |
| ; GFX11-NEXT: s_endpgm |
| ptr addrspace(1) %buffer2) { |
| entry: |
| %call = tail call i64 @_Z13get_global_idj(i32 0) |
| %conv = and i64 %call, 255 |
| %a0 = shl i64 %call, 7 |
| %idx.ext11 = and i64 %a0, 4294934528 |
| %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer1, i64 %idx.ext11 |
| |
| %add.ptr2 = getelementptr inbounds i8, ptr addrspace(1) %buffer2, i64 %idx.ext11 |
| |
| %addr1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 512 |
| %load1 = load i64, ptr addrspace(1) %addr1, align 8 |
| %add.ptr8.3 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 768 |
| %load2 = load i64, ptr addrspace(1) %add.ptr8.3, align 8 |
| %add1 = add i64 %load2, %load1 |
| %add.ptr8.4 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 1024 |
| %load3 = load i64, ptr addrspace(1) %add.ptr8.4, align 8 |
| %add2 = add i64 %load3, %add1 |
| |
| %add.ptr8.5 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr2, i64 1280 |
| %load4 = load i64, ptr addrspace(1) %add.ptr8.5, align 8 |
| |
| %add.ptr8.6 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr2, i64 1536 |
| %load5 = load i64, ptr addrspace(1) %add.ptr8.6, align 8 |
| %add3 = add i64 %load5, %load4 |
| |
| %add.ptr8.7 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr2, i64 1792 |
| %load6 = load i64, ptr addrspace(1) %add.ptr8.7, align 8 |
| %add4 = add i64 %load6, %add3 |
| |
| %add5 = add i64 %add2, %add4 |
| |
| store i64 %add5, ptr addrspace(1) %add.ptr12, align 8 |
| ret void |
| } |
| |
| ; ReverseOrder: adds up eight i64 values loaded relative to %addr1 and stores |
| ; the sum to %add.ptr12. After the load at element 0, the remaining loads are |
| ; written in descending offset order: elements 1792, 1536, 1280, 1024, 768, |
| ; 512 and 256 (byte offsets 0x3800 down to 0x800 in steps of 0x800). |
| define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { |
| ; GFX8-LABEL: ReverseOrder: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX8-NEXT: s_mov_b32 s38, -1 |
| ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 |
| ; GFX8-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX8-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX8-NEXT: s_getpc_b64 s[0:1] |
| ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX8-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX8-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX8-NEXT: s_mov_b32 s32, 0 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 |
| ; GFX8-NEXT: v_mov_b32_e32 v2, s35 |
| ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 |
| ; GFX8-NEXT: v_mov_b32_e32 v3, 3 |
| ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc |
| ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 |
| ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0x3800 |
| ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0x3000 |
| ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0x2800 |
| ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc |
| ; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[3:4] |
| ; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] |
| ; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] |
| ; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] |
| ; GFX8-NEXT: s_movk_i32 s0, 0x2000 |
| ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v4, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0x1800 |
| ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc |
| ; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] |
| ; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] |
| ; GFX8-NEXT: s_movk_i32 s0, 0x1000 |
| ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v4, vcc |
| ; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18] |
| ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x800, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc |
| ; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] |
| ; GFX8-NEXT: s_waitcnt vmcnt(6) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v11 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v6, v12, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(5) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v8, v5, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(4) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(3) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v13, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v14, v5, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(2) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v15, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v16, v5, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(1) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v17, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v18, v5, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc |
| ; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4] |
| ; GFX8-NEXT: s_endpgm |
| ; |
| ; GFX9-LABEL: ReverseOrder: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX9-NEXT: s_mov_b32 s38, -1 |
| ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 |
| ; GFX9-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX9-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX9-NEXT: s_getpc_b64 s[0:1] |
| ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX9-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: s_mov_b32 s32, 0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff8000, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s35 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v22 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, 3 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: s_movk_i32 s0, 0x3000 |
| ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 |
| ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off |
| ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc |
| ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:2048 |
| ; GFX9-NEXT: global_load_dwordx2 v[8:9], v[4:5], off |
| ; GFX9-NEXT: s_movk_i32 s0, 0x2000 |
| ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc |
| ; GFX9-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:2048 |
| ; GFX9-NEXT: s_movk_i32 s0, 0x1000 |
| ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, s0, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc |
| ; GFX9-NEXT: global_load_dwordx2 v[14:15], v[12:13], off |
| ; GFX9-NEXT: global_load_dwordx2 v[16:17], v[4:5], off |
| ; GFX9-NEXT: global_load_dwordx2 v[18:19], v[12:13], off offset:2048 |
| ; GFX9-NEXT: global_load_dwordx2 v[20:21], v[0:1], off offset:2048 |
| ; GFX9-NEXT: s_waitcnt vmcnt(6) |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(5) |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(4) |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(2) |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(1) |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v18, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v19, v1, vcc |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v20, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v21, v1, vcc |
| ; GFX9-NEXT: global_store_dwordx2 v22, v[0:1], s[34:35] |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: ReverseOrder: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX10-NEXT: s_mov_b32 s38, -1 |
| ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 |
| ; GFX10-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX10-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX10-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX10-NEXT: s_mov_b32 s32, 0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX10-NEXT: v_and_b32_e32 v20, 0xffff8000, v1 |
| ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 |
| ; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v20 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x3800, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x3000, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off |
| ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off |
| ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x2800, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, 0x2000, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, 0x1800, v0 |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[4:5], off |
| ; GFX10-NEXT: global_load_dwordx2 v[10:11], v[10:11], off |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v14, vcc_lo, 0x1000, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: global_load_dwordx2 v[12:13], v[12:13], off |
| ; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: global_load_dwordx2 v[16:17], v[14:15], off |
| ; GFX10-NEXT: global_load_dwordx2 v[18:19], v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(6) |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v8, v6 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v7, vcc_lo |
| ; GFX10-NEXT: s_waitcnt vmcnt(5) |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo |
| ; GFX10-NEXT: s_waitcnt vmcnt(2) |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v10, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v12, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo |
| ; GFX10-NEXT: s_waitcnt vmcnt(1) |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v16, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v17, v1, vcc_lo |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v18, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo |
| ; GFX10-NEXT: global_store_dwordx2 v20, v[0:1], s[34:35] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: ReverseOrder: |
| ; GFX11: ; %bb.0: ; %entry |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 |
| ; GFX11-NEXT: s_mov_b32 s32, 0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_and_b32_e32 v16, 0xffff8000, v1 |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v16 |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x3000, v0 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo |
| ; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, 0x2000, v0 |
| ; GFX11-NEXT: s_clause 0x2 |
| ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off |
| ; GFX11-NEXT: global_load_b64 v[6:7], v[2:3], off offset:2048 |
| ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo |
| ; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, 0x1000, v0 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo |
| ; GFX11-NEXT: s_clause 0x4 |
| ; GFX11-NEXT: global_load_b64 v[12:13], v[8:9], off offset:2048 |
| ; GFX11-NEXT: global_load_b64 v[14:15], v[10:11], off |
| ; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off |
| ; GFX11-NEXT: global_load_b64 v[10:11], v[10:11], off offset:2048 |
| ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048 |
| ; GFX11-NEXT: s_waitcnt vmcnt(6) |
| ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v6, v4 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v7, v5, vcc_lo |
| ; GFX11-NEXT: s_waitcnt vmcnt(5) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo |
| ; GFX11-NEXT: s_waitcnt vmcnt(4) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v12, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v13, v3, vcc_lo |
| ; GFX11-NEXT: s_waitcnt vmcnt(2) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v3, vcc_lo |
| ; GFX11-NEXT: s_waitcnt vmcnt(1) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v10, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v11, v3, vcc_lo |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v14, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo |
| ; GFX11-NEXT: global_store_b64 v16, v[0:1], s[34:35] |
| ; GFX11-NEXT: s_endpgm |
| entry: |
| %call = tail call i64 @_Z13get_global_idj(i32 0) |
| ; %conv keeps the low 8 bits of the returned id; it is the i64 element index |
| ; added to %add.ptr12 to form %addr1. |
| %conv = and i64 %call, 255 |
| ; (%call << 7) masked with 4294934528 (0xffff8000) is the byte offset of |
| ; %add.ptr12 from %buffer. |
| %a0 = shl i64 %call, 7 |
| %idx.ext11 = and i64 %a0, 4294934528 |
| %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11 |
| |
| ; Base address shared by every load below (element 0). |
| %addr1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv |
| %load1 = load i64, ptr addrspace(1) %addr1, align 8 |
| |
| ; Element 1792 -> byte offset 0x3800 from %addr1. |
| %add.ptr8.7 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1792 |
| %load8 = load i64, ptr addrspace(1) %add.ptr8.7, align 8 |
| %add7 = add i64 %load8, %load1 |
| |
| ; Element 1536 -> byte offset 0x3000. |
| %add.ptr8.6 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1536 |
| %load7 = load i64, ptr addrspace(1) %add.ptr8.6, align 8 |
| %add6 = add i64 %load7, %add7 |
| |
| ; Element 1280 -> byte offset 0x2800. |
| %add.ptr8.5 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1280 |
| %load6 = load i64, ptr addrspace(1) %add.ptr8.5, align 8 |
| %add5 = add i64 %load6, %add6 |
| |
| ; Element 1024 -> byte offset 0x2000. |
| %add.ptr8.4 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1024 |
| %load5 = load i64, ptr addrspace(1) %add.ptr8.4, align 8 |
| %add4 = add i64 %load5, %add5 |
| |
| ; Element 768 -> byte offset 0x1800. |
| %add.ptr8.3 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 768 |
| %load4 = load i64, ptr addrspace(1) %add.ptr8.3, align 8 |
| %add3 = add i64 %load4, %add4 |
| |
| ; Element 512 -> byte offset 0x1000. |
| %add.ptr8.2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 512 |
| %load3 = load i64, ptr addrspace(1) %add.ptr8.2, align 8 |
| %add2 = add i64 %load3, %add3 |
| |
| ; Element 256 -> byte offset 0x800. |
| %addr2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 256 |
| %load2 = load i64, ptr addrspace(1) %addr2, align 8 |
| %add1 = add i64 %load2, %add2 |
| |
| ; The accumulated sum is stored to %add.ptr12 (not to %addr1). |
| store i64 %add1, ptr addrspace(1) %add.ptr12, align 8 |
| ret void |
| } |
| |
| ; negativeoffset: loads two i64 values at large negative element offsets from |
| ; %buffer_wave (-536870656 and -536870912, i.e. byte offsets |
| ; -0x100000000 + 0x800 and -0x100000000), adds them and stores the sum to |
| ; %add.ptr12. |
| define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buffer) { |
| ; GFX8-LABEL: negativeoffset: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX8-NEXT: s_mov_b32 s38, -1 |
| ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 |
| ; GFX8-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX8-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX8-NEXT: s_getpc_b64 s[0:1] |
| ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX8-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX8-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX8-NEXT: s_mov_b32 s32, 0 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 |
| ; GFX8-NEXT: v_mov_b32_e32 v2, s35 |
| ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 |
| ; GFX8-NEXT: v_mov_b32_e32 v3, 3 |
| ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc |
| ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 |
| ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc |
| ; GFX8-NEXT: s_movk_i32 s0, 0x800 |
| ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 |
| ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, -1, v0, vcc |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v0 |
| ; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] |
| ; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 |
| ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc |
| ; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4] |
| ; GFX8-NEXT: s_endpgm |
| ; |
| ; GFX9-LABEL: negativeoffset: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX9-NEXT: s_mov_b32 s38, -1 |
| ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 |
| ; GFX9-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX9-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX9-NEXT: s_getpc_b64 s[0:1] |
| ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX9-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: s_mov_b32 s32, 0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff8000, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s35 |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v8 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, 3 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x1000, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc |
| ; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 |
| ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:-2048 |
| ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v4 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v5, vcc |
| ; GFX9-NEXT: global_store_dwordx2 v8, v[0:1], s[34:35] |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: negativeoffset: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GFX10-NEXT: s_mov_b32 s38, -1 |
| ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 |
| ; GFX10-NEXT: s_add_u32 s36, s36, s11 |
| ; GFX10-NEXT: s_addc_u32 s37, s37, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX10-NEXT: v_mov_b32_e32 v31, v0 |
| ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] |
| ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] |
| ; GFX10-NEXT: s_mov_b32 s32, 0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff8000, v1 |
| ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 |
| ; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v8 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo |
| ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x800, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v1, vcc_lo |
| ; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1 |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off |
| ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v4 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo |
| ; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[34:35] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: negativeoffset: |
| ; GFX11: ; %bb.0: ; %entry |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 |
| ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 |
| ; GFX11-NEXT: s_mov_b32 s32, 0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff8000, v1 |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v4 |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x1000, v0 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v1, vcc_lo |
| ; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1 |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:-2048 |
| ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo |
| ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[34:35] |
| ; GFX11-NEXT: s_endpgm |
| entry: |
| ; NOTE(review): the call site references attribute group #2, but no definition |
| ; of #2 appears in the visible part of this file - confirm it is intentional. |
| %call = tail call i64 @_Z13get_global_idj(i32 0) #2 |
| ; %conv keeps the low 8 bits of the returned id; it is the i64 element index |
| ; used to form %buffer_wave. |
| %conv = and i64 %call, 255 |
| ; (%call << 7) masked with 4294934528 (0xffff8000) is the byte offset of |
| ; %add.ptr12 from %buffer. |
| %0 = shl i64 %call, 7 |
| %idx.ext11 = and i64 %0, 4294934528 |
| %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11 |
| |
| %buffer_wave = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv |
| |
| ; Element -536870656 -> byte offset -0x100000000 + 0x800 from %buffer_wave. |
| %addr1 = getelementptr inbounds i64, ptr addrspace(1) %buffer_wave, i64 -536870656 |
| %load1 = load i64, ptr addrspace(1) %addr1, align 8 |
| |
| ; Element -536870912 -> byte offset -0x100000000 (exactly -2^32 bytes). |
| %addr2 = getelementptr inbounds i64, ptr addrspace(1) %buffer_wave, i64 -536870912 |
| %load2 = load i64, ptr addrspace(1) %addr2, align 8 |
| |
| |
| %add = add i64 %load2, %load1 |
| |
| ; The sum is stored to %add.ptr12 (not to %buffer_wave). |
| store i64 %add, ptr addrspace(1) %add.ptr12, align 8 |
| ret void |
| } |
| |
| ; negativeoffsetnullptr: loads one byte at offset -1 from the flat pointer |
| ; produced by addrspacecasting the addrspace(5) null pointer, compares it with |
| ; 0 once in %entry, and then loops in %branch until that comparison is true. |
| define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) { |
| ; GFX8-LABEL: negativeoffsetnullptr: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_load_dword s1, s[4:5], 0xec |
| ; GFX8-NEXT: s_add_u32 s0, 0, -1 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NEXT: s_addc_u32 s1, s1, -1 |
| ; GFX8-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] |
| ; GFX8-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 |
| ; GFX8-NEXT: .LBB8_1: ; %branch |
| ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX8-NEXT: s_and_b64 s[2:3], exec, vcc |
| ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] |
| ; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 |
| ; GFX8-NEXT: ; %bb.2: ; %end |
| ; GFX8-NEXT: s_endpgm |
| ; |
| ; GFX9-LABEL: negativeoffsetnullptr: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_mov_b64 s[0:1], src_private_base |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, -1, 0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc |
| ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 |
| ; GFX9-NEXT: .LBB8_1: ; %branch |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_and_b64 s[2:3], exec, vcc |
| ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] |
| ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] |
| ; GFX9-NEXT: s_cbranch_execnz .LBB8_1 |
| ; GFX9-NEXT: ; %bb.2: ; %end |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: negativeoffsetnullptr: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base |
| ; GFX10-NEXT: s_add_u32 s0, 0, -1 |
| ; GFX10-NEXT: s_addc_u32 s1, s1, -1 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX10-NEXT: s_mov_b32 s0, 0 |
| ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 |
| ; GFX10-NEXT: .LBB8_1: ; %branch |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_and_b32 s1, exec_lo, vcc_lo |
| ; GFX10-NEXT: s_or_b32 s0, s1, s0 |
| ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 |
| ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 |
| ; GFX10-NEXT: ; %bb.2: ; %end |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: negativeoffsetnullptr: |
| ; GFX11: ; %bb.0: ; %entry |
| ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base |
| ; GFX11-NEXT: v_add_co_u32 v0, s0, -1, 0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0 |
| ; GFX11-NEXT: s_mov_b32 s0, 0 |
| ; GFX11-NEXT: flat_load_u8 v0, v[0:1] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 |
| ; GFX11-NEXT: .LBB8_1: ; %branch |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_and_b32 s1, exec_lo, vcc_lo |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_or_b32 s0, s1, s0 |
| ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 |
| ; GFX11-NEXT: s_cbranch_execnz .LBB8_1 |
| ; GFX11-NEXT: ; %bb.2: ; %end |
| ; GFX11-NEXT: s_endpgm |
| entry: |
| ; The select condition is the constant false, so %null always takes the second |
| ; operand: the addrspace(5) null pointer addrspacecast to a flat pointer. |
| %null = select i1 false, ptr %buffer, ptr addrspacecast (ptr addrspace(5) null to ptr) |
| ; Step one byte below that pointer (offset -1); this GEP is not inbounds. |
| %gep = getelementptr i8, ptr %null, i64 -1 |
| %ld = load i8, ptr %gep |
| ; %cmp is evaluated once here and is loop-invariant in %branch. |
| %cmp = icmp eq i8 %ld, 0 |
| br label %branch |
| |
| branch: |
| ; Exits to %end when %cmp is true, otherwise branches back to itself. |
| br i1 %cmp, label %end, label %branch |
| |
| end: |
| ret void |
| } |
| |
| |
| ; Attribute group #0, referenced by the declaration of @_Z13get_global_idj. |
| attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } |