|  | ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | 
|  | ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s | 
|  | ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s | 
|  | ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s | 
|  | ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90A %s | 
|  | ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s | 
|  | ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s | 
|  |  | 
|  | ; External function (declaration only): takes an i32 and returns an i64. | 
|  | ; clmem_read_simplified calls it with 0 and derives its addresses from the result. | 
|  | declare i64 @_Z13get_global_idj(i32) #0 | 
|  |  | 
|  | ; clmem_read_simplified: straight-line kernel that sums eight i64 values read | 
|  | ; from %buffer and stores the total back into %buffer. | 
|  | ; All eight loads share one base address (%addr1) and are 256 i64 elements | 
|  | ; (2048 bytes) apart, i.e. at byte offsets 0x0, 0x800, ... 0x3800 from it. | 
|  | ; The per-prefix check lines are autogenerated (see the NOTE at the top of the | 
|  | ; file); regenerate them with that script rather than editing them by hand. | 
|  | define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1)  %buffer) { | 
|  | ; GFX8-LABEL: clmem_read_simplified: | 
|  | ; GFX8:       ; %bb.0: ; %entry | 
|  | ; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX8-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX8-NEXT:    s_mov_b32 s39, 0xe80000 | 
|  | ; GFX8-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX8-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX8-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0 | 
|  | ; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX8-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX8-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5] | 
|  | ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v2, s35 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s34, v1 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v3, 3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc | 
|  | ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x800 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x1000 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x1800 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[11:12], v[3:4] | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6] | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[7:8] | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10] | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x2000 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x2800 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[13:14], v[13:14] | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[15:16], v[15:16] | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x3000 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[17:18], v[17:18] | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x3800, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4] | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(6) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v11 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v6, v12, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v8, v5, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v9, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v10, v5, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v13, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v14, v5, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v15, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v16, v5, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v17, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v18, v5, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc | 
|  | ; GFX8-NEXT:    flat_store_dwordx2 v[1:2], v[3:4] | 
|  | ; GFX8-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX9-LABEL: clmem_read_simplified: | 
|  | ; GFX9:       ; %bb.0: ; %entry | 
|  | ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX9-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000 | 
|  | ; GFX9-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX9-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX9-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0 | 
|  | ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX9-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX9-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5] | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff8000, v1 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v1, s35 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v18 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v3, 3 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    s_movk_i32 s1, 0x2000 | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2048 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, s1, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 | 
|  | ; GFX9-NEXT:    s_movk_i32 s0, 0x1000 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, s0, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[12:13], v[10:11], off offset:2048 | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[6:7], off | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[6:7], off offset:2048 | 
|  | ; GFX9-NEXT:    s_movk_i32 s0, 0x3000 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off offset:2048 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(6) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v2 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v3, vcc | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v12, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v13, v1, vcc | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v16, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v17, v1, vcc | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v1, vcc | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc | 
|  | ; GFX9-NEXT:    global_store_dwordx2 v18, v[0:1], s[34:35] | 
|  | ; GFX9-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX10-LABEL: clmem_read_simplified: | 
|  | ; GFX10:       ; %bb.0: ; %entry | 
|  | ; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX10-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000 | 
|  | ; GFX10-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX10-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX10-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX10-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX10-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0 | 
|  | ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX10-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7] | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v2, 3 | 
|  | ; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff8000, v1 | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v20 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x1000 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v0, 0x2000 | 
|  | ; GFX10-NEXT:    s_clause 0x1 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:-2048 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_clause 0x1 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[2:3], off | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[8:9], off offset:-2048 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x3000 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_clause 0x1 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[14:15], v[8:9], off | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[16:17], v[2:3], off offset:-2048 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3800, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_clause 0x1 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[18:19], v[0:1], off | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(6) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v4 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v10, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v12, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v14, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v15, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v16, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v17, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v18, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo | 
|  | ; GFX10-NEXT:    global_store_dwordx2 v20, v[0:1], s[34:35] | 
|  | ; GFX10-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX11-LABEL: clmem_read_simplified: | 
|  | ; GFX11:       ; %bb.0: ; %entry | 
|  | ; GFX11-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX11-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX11-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 | 
|  | ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 | 
|  | ; GFX11-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24 | 
|  | ; GFX11-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1] | 
|  | ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff8000, v1 | 
|  | ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v16 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v2, vcc_lo | 
|  | ; GFX11-NEXT:    s_clause 0x1 | 
|  | ; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off | 
|  | ; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2048 | 
|  | ; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v0, 0x2000 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo | 
|  | ; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, 0x1000, v0 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo | 
|  | ; GFX11-NEXT:    s_clause 0x1 | 
|  | ; GFX11-NEXT:    global_load_b64 v[10:11], v[6:7], off offset:-4096 | 
|  | ; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off offset:2048 | 
|  | ; GFX11-NEXT:    v_add_co_u32 v12, vcc_lo, 0x2000, v0 | 
|  | ; GFX11-NEXT:    global_load_b64 v[6:7], v[6:7], off | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v13, null, 0, v1, vcc_lo | 
|  | ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo | 
|  | ; GFX11-NEXT:    s_clause 0x2 | 
|  | ; GFX11-NEXT:    global_load_b64 v[12:13], v[12:13], off offset:2048 | 
|  | ; GFX11-NEXT:    global_load_b64 v[14:15], v[0:1], off | 
|  | ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048 | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(6) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v4, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v10, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v11, v3, vcc_lo | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v9, v3, vcc_lo | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v7, v3, vcc_lo | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v12, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v13, v3, vcc_lo | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v14, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v15, v3, vcc_lo | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo | 
|  | ; GFX11-NEXT:    global_store_b64 v16, v[0:1], s[34:35] | 
|  | ; GFX11-NEXT:    s_endpgm | 
|  | entry: | 
|  | ; Both address components below are derived from this one call result. | 
|  | %call = tail call i64 @_Z13get_global_idj(i32 0) | 
|  | ; Low 8 bits of the result: i64 element index applied when forming %addr1. | 
|  | %conv = and i64 %call, 255 | 
|  | ; (result << 7) masked with 4294934528 (0xffff8000): byte offset into %buffer. | 
|  | %a0 = shl i64 %call, 7 | 
|  | %idx.ext11 = and i64 %a0, 4294934528 | 
|  | ; Byte-addressed (i8 GEP) position in %buffer; also the final store address. | 
|  | %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11 | 
|  |  | 
|  | ; Common base of all eight loads: %add.ptr12 advanced by %conv i64 elements. | 
|  | ; Loads 1-2: element offsets +0 and +256 (byte offsets 0 and 2048). | 
|  | %addr1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv | 
|  | %load1 = load i64, ptr addrspace(1) %addr1, align 8 | 
|  | %addr2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 256 | 
|  | %load2 = load i64, ptr addrspace(1) %addr2, align 8 | 
|  | %add.1 = add i64 %load2, %load1 | 
|  |  | 
|  | ; Loads 3-4: element offsets +512 and +768, each added to the running sum. | 
|  | %add.ptr8.2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 512 | 
|  | %load3 = load i64, ptr addrspace(1) %add.ptr8.2, align 8 | 
|  | %add.2 = add i64 %load3, %add.1 | 
|  | %add.ptr8.3 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 768 | 
|  | %load4 = load i64, ptr addrspace(1) %add.ptr8.3, align 8 | 
|  | %add.3 = add i64 %load4, %add.2 | 
|  |  | 
|  | ; Loads 5-6: element offsets +1024 and +1280. | 
|  | %add.ptr8.4 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1024 | 
|  | %load5 = load i64, ptr addrspace(1) %add.ptr8.4, align 8 | 
|  | %add.4 = add i64 %load5, %add.3 | 
|  | %add.ptr8.5 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1280 | 
|  | %load6 = load i64, ptr addrspace(1) %add.ptr8.5, align 8 | 
|  | %add.5 = add i64 %load6, %add.4 | 
|  |  | 
|  | ; Loads 7-8: element offsets +1536 and +1792 (last byte offset 0x3800). | 
|  | %add.ptr8.6 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1536 | 
|  | %load7 = load i64, ptr addrspace(1) %add.ptr8.6, align 8 | 
|  | %add.6 = add i64 %load7, %add.5 | 
|  | %add.ptr8.7 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1792 | 
|  | %load8 = load i64, ptr addrspace(1) %add.ptr8.7, align 8 | 
|  | %add.7 = add i64 %load8, %add.6 | 
|  |  | 
|  | ; The total goes to %add.ptr12 (the address without the %conv index), not %addr1. | 
|  | store i64 %add.7, ptr addrspace(1) %add.ptr12, align 8 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) { | 
|  | ; GFX8-LABEL: clmem_read: | 
|  | ; GFX8:       ; %bb.0: ; %entry | 
|  | ; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX8-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX8-NEXT:    s_mov_b32 s39, 0xe80000 | 
|  | ; GFX8-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX8-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX8-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0 | 
|  | ; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX8-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX8-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5] | 
|  | ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 17, v0 | 
|  | ; GFX8-NEXT:    v_and_b32_e32 v12, 0xfe000000, v1 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v1, 3 | 
|  | ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
|  | ; GFX8-NEXT:    v_or_b32_e32 v0, v12, v0 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v1, s35 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s34, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x5000 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v10, 0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v11, 0 | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x7f | 
|  | ; GFX8-NEXT:  .LBB1_1: ; %for.cond.preheader | 
|  | ; GFX8-NEXT:    ; =>This Loop Header: Depth=1 | 
|  | ; GFX8-NEXT:    ; Child Loop BB1_2 Depth 2 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v3, v1 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v2, v0 | 
|  | ; GFX8-NEXT:    s_mov_b32 s1, 0 | 
|  | ; GFX8-NEXT:  .LBB1_2: ; %for.body | 
|  | ; GFX8-NEXT:    ; Parent Loop BB1_1 Depth=1 | 
|  | ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xffffb000, v2 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v3, vcc | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[13:14], v[4:5] | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0xffffb800, v2 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, -1, v3, vcc | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[15:16], v[6:7] | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xffffc000, v2 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v3, vcc | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[17:18], v[4:5] | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0xffffc800, v2 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, -1, v3, vcc | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[6:7], v[6:7] | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xffffd000, v2 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v3, vcc | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v19, vcc, 0xffffd800, v2 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v20, vcc, -1, v3, vcc | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v21, vcc, 0xffffe000, v2 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v22, vcc, -1, v3, vcc | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[4:5] | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[19:20] | 
|  | ; GFX8-NEXT:    s_addk_i32 s1, 0x2000 | 
|  | ; GFX8-NEXT:    s_cmp_gt_u32 s1, 0x3fffff | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v13, v10 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v24, vcc, v14, v11, vcc | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 0xffffe800, v2 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, -1, v3, vcc | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v13, vcc, 0xfffff000, v2 | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[19:20], v[21:22] | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[10:11], v[10:11] | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, -1, v3, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(6) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v15, v23 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v22, vcc, v16, v24, vcc | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 0xfffff800, v2 | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[13:14], v[13:14] | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, -1, v3, vcc | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[15:16], v[15:16] | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(7) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v17, v21 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v22, vcc, v18, v22, vcc | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[17:18], v[2:3] | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x10000, v2 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(7) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v21 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v7, v22, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(6) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v8, v6 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v6 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v19, v4 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v20, v5, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v10, v4 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v11, v5, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v13, v4 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v14, v5, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v15, v4 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v16, v5, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v17, v4 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v18, v5, vcc | 
|  | ; GFX8-NEXT:    s_cbranch_scc0 .LBB1_2 | 
|  | ; GFX8-NEXT:  ; %bb.3: ; %while.cond.loopexit | 
|  | ; GFX8-NEXT:    ; in Loop: Header=BB1_1 Depth=1 | 
|  | ; GFX8-NEXT:    s_add_i32 s1, s0, -1 | 
|  | ; GFX8-NEXT:    s_cmp_eq_u32 s0, 0 | 
|  | ; GFX8-NEXT:    s_cbranch_scc1 .LBB1_5 | 
|  | ; GFX8-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 | 
|  | ; GFX8-NEXT:    s_mov_b32 s0, s1 | 
|  | ; GFX8-NEXT:    s_branch .LBB1_1 | 
|  | ; GFX8-NEXT:  .LBB1_5: ; %while.end | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v1, s35 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s34, v12 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[10:11] | 
|  | ; GFX8-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX900-LABEL: clmem_read: | 
|  | ; GFX900:       ; %bb.0: ; %entry | 
|  | ; GFX900-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX900-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX900-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX900-NEXT:    s_mov_b32 s39, 0xe00000 | 
|  | ; GFX900-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX900-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX900-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX900-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX900-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX900-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX900-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0 | 
|  | ; GFX900-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX900-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX900-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX900-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX900-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5] | 
|  | ; GFX900-NEXT:    v_and_b32_e32 v1, 0xff, v0 | 
|  | ; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 17, v0 | 
|  | ; GFX900-NEXT:    v_and_b32_e32 v6, 0xfe000000, v0 | 
|  | ; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 3, v6 | 
|  | ; GFX900-NEXT:    v_mov_b32_e32 v1, s35 | 
|  | ; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, s34, v0 | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX900-NEXT:    s_movk_i32 s0, 0x5000 | 
|  | ; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0 | 
|  | ; GFX900-NEXT:    v_mov_b32_e32 v4, 0 | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX900-NEXT:    v_mov_b32_e32 v5, 0 | 
|  | ; GFX900-NEXT:    s_movk_i32 s5, 0x7f | 
|  | ; GFX900-NEXT:    s_movk_i32 s2, 0xd000 | 
|  | ; GFX900-NEXT:    s_movk_i32 s3, 0xe000 | 
|  | ; GFX900-NEXT:    s_movk_i32 s4, 0xf000 | 
|  | ; GFX900-NEXT:  .LBB1_1: ; %for.cond.preheader | 
|  | ; GFX900-NEXT:    ; =>This Loop Header: Depth=1 | 
|  | ; GFX900-NEXT:    ; Child Loop BB1_2 Depth 2 | 
|  | ; GFX900-NEXT:    v_mov_b32_e32 v3, v1 | 
|  | ; GFX900-NEXT:    v_mov_b32_e32 v2, v0 | 
|  | ; GFX900-NEXT:    s_mov_b32 s6, 0 | 
|  | ; GFX900-NEXT:  .LBB1_2: ; %for.body | 
|  | ; GFX900-NEXT:    ; Parent Loop BB1_1 Depth=1 | 
|  | ; GFX900-NEXT:    ; => This Inner Loop Header: Depth=2 | 
|  | ; GFX900-NEXT:    v_add_co_u32_e32 v7, vcc, 0xffffb000, v2 | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e32 v8, vcc, -1, v3, vcc | 
|  | ; GFX900-NEXT:    global_load_dwordx2 v[9:10], v[2:3], off offset:-4096 | 
|  | ; GFX900-NEXT:    global_load_dwordx2 v[11:12], v[2:3], off offset:-2048 | 
|  | ; GFX900-NEXT:    v_add_co_u32_e32 v13, vcc, 0xffffc000, v2 | 
|  | ; GFX900-NEXT:    global_load_dwordx2 v[7:8], v[7:8], off | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e32 v14, vcc, -1, v3, vcc | 
|  | ; GFX900-NEXT:    global_load_dwordx2 v[17:18], v[13:14], off offset:-2048 | 
|  | ; GFX900-NEXT:    global_load_dwordx2 v[19:20], v[13:14], off | 
|  | ; GFX900-NEXT:    v_add_co_u32_e32 v15, vcc, s2, v2 | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e32 v16, vcc, -1, v3, vcc | 
|  | ; GFX900-NEXT:    v_add_co_u32_e32 v13, vcc, s3, v2 | 
|  | ; GFX900-NEXT:    global_load_dwordx2 v[15:16], v[15:16], off offset:-2048 | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e32 v14, vcc, -1, v3, vcc | 
|  | ; GFX900-NEXT:    s_addk_i32 s6, 0x2000 | 
|  | ; GFX900-NEXT:    s_cmp_gt_u32 s6, 0x3fffff | 
|  | ; GFX900-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX900-NEXT:    v_add_co_u32_e32 v21, vcc, v7, v4 | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v5, vcc | 
|  | ; GFX900-NEXT:    global_load_dwordx2 v[7:8], v[13:14], off offset:-4096 | 
|  | ; GFX900-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX900-NEXT:    v_add_co_u32_e64 v23, s[0:1], v17, v21 | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e64 v24, s[0:1], v18, v5, s[0:1] | 
|  | ; GFX900-NEXT:    global_load_dwordx2 v[17:18], v[13:14], off offset:-2048 | 
|  | ; GFX900-NEXT:    global_load_dwordx2 v[21:22], v[13:14], off | 
|  | ; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v2 | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v3, vcc | 
|  | ; GFX900-NEXT:    global_load_dwordx2 v[4:5], v[4:5], off offset:-2048 | 
|  | ; GFX900-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX900-NEXT:    v_add_co_u32_e32 v19, vcc, v19, v23 | 
|  | ; GFX900-NEXT:    global_load_dwordx2 v[13:14], v[2:3], off | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e32 v20, vcc, v20, v24, vcc | 
|  | ; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, 0x10000, v2 | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
|  | ; GFX900-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX900-NEXT:    v_add_co_u32_e32 v15, vcc, v15, v19 | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e32 v16, vcc, v16, v20, vcc | 
|  | ; GFX900-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX900-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v15 | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v16, vcc | 
|  | ; GFX900-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX900-NEXT:    v_add_co_u32_e32 v7, vcc, v17, v7 | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e32 v8, vcc, v18, v8, vcc | 
|  | ; GFX900-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX900-NEXT:    v_add_co_u32_e32 v7, vcc, v21, v7 | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e32 v8, vcc, v22, v8, vcc | 
|  | ; GFX900-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v7 | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v8, vcc | 
|  | ; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, v9, v4 | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v10, v5, vcc | 
|  | ; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, v11, v4 | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v12, v5, vcc | 
|  | ; GFX900-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, v13, v4 | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, v14, v5, vcc | 
|  | ; GFX900-NEXT:    s_cbranch_scc0 .LBB1_2 | 
|  | ; GFX900-NEXT:  ; %bb.3: ; %while.cond.loopexit | 
|  | ; GFX900-NEXT:    ; in Loop: Header=BB1_1 Depth=1 | 
|  | ; GFX900-NEXT:    s_add_i32 s0, s5, -1 | 
|  | ; GFX900-NEXT:    s_cmp_eq_u32 s5, 0 | 
|  | ; GFX900-NEXT:    s_cbranch_scc1 .LBB1_5 | 
|  | ; GFX900-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 | 
|  | ; GFX900-NEXT:    s_mov_b32 s5, s0 | 
|  | ; GFX900-NEXT:    s_branch .LBB1_1 | 
|  | ; GFX900-NEXT:  .LBB1_5: ; %while.end | 
|  | ; GFX900-NEXT:    v_mov_b32_e32 v1, s35 | 
|  | ; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, s34, v6 | 
|  | ; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX900-NEXT:    global_store_dwordx2 v[0:1], v[4:5], off | 
|  | ; GFX900-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX10-LABEL: clmem_read: | 
|  | ; GFX10:       ; %bb.0: ; %entry | 
|  | ; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX10-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000 | 
|  | ; GFX10-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX10-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX10-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX10-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX10-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0 | 
|  | ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX10-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7] | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 17, v0 | 
|  | ; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v2, 0 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v3, 0 | 
|  | ; GFX10-NEXT:    s_movk_i32 s1, 0x7f | 
|  | ; GFX10-NEXT:    v_and_b32_e32 v6, 0xfe000000, v1 | 
|  | ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 3, v6 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, s0, v0, s34 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 0, s35, s0 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x5000, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:  .LBB1_1: ; %for.cond.preheader | 
|  | ; GFX10-NEXT:    ; =>This Loop Header: Depth=1 | 
|  | ; GFX10-NEXT:    ; Child Loop BB1_2 Depth 2 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v5, v1 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v4, v0 | 
|  | ; GFX10-NEXT:    s_mov_b32 s2, 0 | 
|  | ; GFX10-NEXT:  .LBB1_2: ; %for.body | 
|  | ; GFX10-NEXT:    ; Parent Loop BB1_1 Depth=1 | 
|  | ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v4, 0xffffb800 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, -1, v5, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v4, 0xffffc800 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, -1, v5, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v13, vcc_lo, v4, 0xffffd800 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v14, vcc_lo, -1, v5, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v17, vcc_lo, v4, 0xffffe800 | 
|  | ; GFX10-NEXT:    s_clause 0x2 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[11:12], v[7:8], off offset:-2048 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[15:16], v[9:10], off offset:-2048 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[19:20], v[13:14], off offset:-2048 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v18, vcc_lo, -1, v5, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v21, vcc_lo, 0xfffff000, v4 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v22, vcc_lo, -1, v5, vcc_lo | 
|  | ; GFX10-NEXT:    s_clause 0x7 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[23:24], v[17:18], off offset:-2048 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[7:8], v[7:8], off | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[9:10], v[9:10], off | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[13:14], v[13:14], off | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[25:26], v[17:18], off | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[27:28], v[21:22], off | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[29:30], v[4:5], off offset:-2048 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[31:32], v[4:5], off | 
|  | ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x10000, v4 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo | 
|  | ; GFX10-NEXT:    s_addk_i32 s2, 0x2000 | 
|  | ; GFX10-NEXT:    s_cmp_gt_u32 s2, 0x3fffff | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(10) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, s0, v11, v2 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v12, v3, s0 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(6) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, s0, v7, v2 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v8, v3, s0 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, s0, v15, v2 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v16, v3, s0 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, s0, v9, v2 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v10, v3, s0 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, s0, v19, v2 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v20, v3, s0 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, s0, v13, v2 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v14, v3, s0 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, s0, v23, v2 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v24, v3, s0 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, s0, v25, v2 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v26, v3, s0 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, s0, v27, v2 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v28, v3, s0 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, s0, v29, v2 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v30, v3, s0 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v31, v2 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v32, v3, vcc_lo | 
|  | ; GFX10-NEXT:    s_cbranch_scc0 .LBB1_2 | 
|  | ; GFX10-NEXT:  ; %bb.3: ; %while.cond.loopexit | 
|  | ; GFX10-NEXT:    ; in Loop: Header=BB1_1 Depth=1 | 
|  | ; GFX10-NEXT:    s_add_i32 s0, s1, -1 | 
|  | ; GFX10-NEXT:    s_cmp_eq_u32 s1, 0 | 
|  | ; GFX10-NEXT:    s_cbranch_scc1 .LBB1_5 | 
|  | ; GFX10-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 | 
|  | ; GFX10-NEXT:    s_mov_b32 s1, s0 | 
|  | ; GFX10-NEXT:    s_branch .LBB1_1 | 
|  | ; GFX10-NEXT:  .LBB1_5: ; %while.end | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, s0, s34, v6 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, s35, 0, s0 | 
|  | ; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off | 
|  | ; GFX10-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX90A-LABEL: clmem_read: | 
|  | ; GFX90A:       ; %bb.0: ; %entry | 
|  | ; GFX90A-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX90A-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX90A-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX90A-NEXT:    s_mov_b32 s39, 0xe00000 | 
|  | ; GFX90A-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX90A-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX90A-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX90A-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX90A-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX90A-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX90A-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0 | 
|  | ; GFX90A-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX90A-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX90A-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5] | 
|  | ; GFX90A-NEXT:    v_and_b32_e32 v1, 0xff, v0 | 
|  | ; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 17, v0 | 
|  | ; GFX90A-NEXT:    v_and_b32_e32 v0, 0xfe000000, v0 | 
|  | ; GFX90A-NEXT:    v_lshl_or_b32 v1, v1, 3, v0 | 
|  | ; GFX90A-NEXT:    v_mov_b32_e32 v2, s35 | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, s34, v1 | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v2, vcc | 
|  | ; GFX90A-NEXT:    s_movk_i32 s0, 0x5000 | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v1 | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
|  | ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], 0, 0 | 
|  | ; GFX90A-NEXT:    s_movk_i32 s3, 0x7f | 
|  | ; GFX90A-NEXT:    s_movk_i32 s0, 0xd000 | 
|  | ; GFX90A-NEXT:    s_movk_i32 s1, 0xe000 | 
|  | ; GFX90A-NEXT:    s_movk_i32 s2, 0xf000 | 
|  | ; GFX90A-NEXT:  .LBB1_1: ; %for.cond.preheader | 
|  | ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1 | 
|  | ; GFX90A-NEXT:    ; Child Loop BB1_2 Depth 2 | 
|  | ; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[0,1] | 
|  | ; GFX90A-NEXT:    s_mov_b32 s4, 0 | 
|  | ; GFX90A-NEXT:  .LBB1_2: ; %for.body | 
|  | ; GFX90A-NEXT:    ; Parent Loop BB1_1 Depth=1 | 
|  | ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2 | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v12, vcc, 0xffffb000, v6 | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v13, vcc, -1, v7, vcc | 
|  | ; GFX90A-NEXT:    global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 | 
|  | ; GFX90A-NEXT:    global_load_dwordx2 v[10:11], v[6:7], off offset:-2048 | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v14, vcc, 0xffffc000, v6 | 
|  | ; GFX90A-NEXT:    global_load_dwordx2 v[12:13], v[12:13], off | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v15, vcc, -1, v7, vcc | 
|  | ; GFX90A-NEXT:    global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 | 
|  | ; GFX90A-NEXT:    global_load_dwordx2 v[20:21], v[14:15], off | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v16, vcc, s0, v6 | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v17, vcc, -1, v7, vcc | 
|  | ; GFX90A-NEXT:    global_load_dwordx2 v[16:17], v[16:17], off offset:-2048 | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v14, vcc, s1, v6 | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v15, vcc, -1, v7, vcc | 
|  | ; GFX90A-NEXT:    global_load_dwordx2 v[24:25], v[14:15], off offset:-4096 | 
|  | ; GFX90A-NEXT:    global_load_dwordx2 v[26:27], v[14:15], off offset:-2048 | 
|  | ; GFX90A-NEXT:    global_load_dwordx2 v[28:29], v[14:15], off | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v22, vcc, s2, v6 | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v23, vcc, -1, v7, vcc | 
|  | ; GFX90A-NEXT:    global_load_dwordx2 v[14:15], v[22:23], off offset:-2048 | 
|  | ; GFX90A-NEXT:    global_load_dwordx2 v[30:31], v[6:7], off | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, 0x10000, v6 | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc | 
|  | ; GFX90A-NEXT:    s_addk_i32 s4, 0x2000 | 
|  | ; GFX90A-NEXT:    s_cmp_gt_u32 s4, 0x3fffff | 
|  | ; GFX90A-NEXT:    s_waitcnt vmcnt(8) | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v12, v4 | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v13, v5, vcc | 
|  | ; GFX90A-NEXT:    s_waitcnt vmcnt(7) | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v18, v1 | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v19, v4, vcc | 
|  | ; GFX90A-NEXT:    s_waitcnt vmcnt(6) | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v20, v1 | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v21, v4, vcc | 
|  | ; GFX90A-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v16, v1 | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v17, v4, vcc | 
|  | ; GFX90A-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v24, v1 | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v25, v4, vcc | 
|  | ; GFX90A-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v26, v1 | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v27, v4, vcc | 
|  | ; GFX90A-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v28, v1 | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v29, v4, vcc | 
|  | ; GFX90A-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v14, v1 | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v15, v4, vcc | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v8, v1 | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v4, vcc | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v10, v1 | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v11, v4, vcc | 
|  | ; GFX90A-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v30, v1 | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v31, v5, vcc | 
|  | ; GFX90A-NEXT:    s_cbranch_scc0 .LBB1_2 | 
|  | ; GFX90A-NEXT:  ; %bb.3: ; %while.cond.loopexit | 
|  | ; GFX90A-NEXT:    ; in Loop: Header=BB1_1 Depth=1 | 
|  | ; GFX90A-NEXT:    s_add_i32 s4, s3, -1 | 
|  | ; GFX90A-NEXT:    s_cmp_eq_u32 s3, 0 | 
|  | ; GFX90A-NEXT:    s_cbranch_scc1 .LBB1_5 | 
|  | ; GFX90A-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 | 
|  | ; GFX90A-NEXT:    s_mov_b32 s3, s4 | 
|  | ; GFX90A-NEXT:    s_branch .LBB1_1 | 
|  | ; GFX90A-NEXT:  .LBB1_5: ; %while.end | 
|  | ; GFX90A-NEXT:    v_mov_b32_e32 v1, s35 | 
|  | ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s34, v0 | 
|  | ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX90A-NEXT:    global_store_dwordx2 v[0:1], v[4:5], off | 
|  | ; GFX90A-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX11-LABEL: clmem_read: | 
|  | ; GFX11:       ; %bb.0: ; %entry | 
|  | ; GFX11-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX11-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX11-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 | 
|  | ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 | 
|  | ; GFX11-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24 | 
|  | ; GFX11-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1] | 
|  | ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 17, v0 | 
|  | ; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0xff, v0 | 
|  | ; GFX11-NEXT:    s_movk_i32 s1, 0x7f | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_and_b32_e32 v6, 0xfe000000, v1 | 
|  | ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 3, v6 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v0, s0, v0, s34 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s35, s0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x5000, v0 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo | 
|  | ; GFX11-NEXT:  .LBB1_1: ; %for.cond.preheader | 
|  | ; GFX11-NEXT:    ; =>This Loop Header: Depth=1 | 
|  | ; GFX11-NEXT:    ; Child Loop BB1_2 Depth 2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 | 
|  | ; GFX11-NEXT:    s_mov_b32 s2, 0 | 
|  | ; GFX11-NEXT:  .LBB1_2: ; %for.body | 
|  | ; GFX11-NEXT:    ; Parent Loop BB1_1 Depth=1 | 
|  | ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v7, vcc_lo, v4, 0xffffc000 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, -1, v5, vcc_lo | 
|  | ; GFX11-NEXT:    v_add_co_u32 v9, vcc_lo, 0xffffc000, v4 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v10, null, -1, v5, vcc_lo | 
|  | ; GFX11-NEXT:    global_load_b64 v[13:14], v[7:8], off offset:-4096 | 
|  | ; GFX11-NEXT:    v_add_co_u32 v11, vcc_lo, 0xffffd000, v4 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v12, null, -1, v5, vcc_lo | 
|  | ; GFX11-NEXT:    v_add_co_u32 v15, vcc_lo, v4, 0xffffe000 | 
|  | ; GFX11-NEXT:    global_load_b64 v[9:10], v[9:10], off offset:-2048 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v16, null, -1, v5, vcc_lo | 
|  | ; GFX11-NEXT:    global_load_b64 v[11:12], v[11:12], off offset:-2048 | 
|  | ; GFX11-NEXT:    v_add_co_u32 v17, vcc_lo, 0xffffe000, v4 | 
|  | ; GFX11-NEXT:    s_clause 0x1 | 
|  | ; GFX11-NEXT:    global_load_b64 v[19:20], v[15:16], off offset:-4096 | 
|  | ; GFX11-NEXT:    global_load_b64 v[7:8], v[7:8], off | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v18, null, -1, v5, vcc_lo | 
|  | ; GFX11-NEXT:    v_add_co_u32 v21, vcc_lo, 0xfffff000, v4 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v22, null, -1, v5, vcc_lo | 
|  | ; GFX11-NEXT:    s_clause 0x5 | 
|  | ; GFX11-NEXT:    global_load_b64 v[17:18], v[17:18], off offset:-2048 | 
|  | ; GFX11-NEXT:    global_load_b64 v[15:16], v[15:16], off | 
|  | ; GFX11-NEXT:    global_load_b64 v[21:22], v[21:22], off offset:-2048 | 
|  | ; GFX11-NEXT:    global_load_b64 v[23:24], v[4:5], off offset:-4096 | 
|  | ; GFX11-NEXT:    global_load_b64 v[25:26], v[4:5], off offset:-2048 | 
|  | ; GFX11-NEXT:    global_load_b64 v[27:28], v[4:5], off | 
|  | ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x10000, v4 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo | 
|  | ; GFX11-NEXT:    s_addk_i32 s2, 0x2000 | 
|  | ; GFX11-NEXT:    s_cmp_gt_u32 s2, 0x3fffff | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(10) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, s0, v13, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v14, v3, s0 | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(9) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, s0, v9, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v10, v3, s0 | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(6) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, s0, v7, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v8, v3, s0 | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, s0, v11, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v12, v3, s0 | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, s0, v19, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v20, v3, s0 | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, s0, v17, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v18, v3, s0 | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, s0, v15, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v16, v3, s0 | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, s0, v21, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v22, v3, s0 | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, s0, v23, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v24, v3, s0 | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, s0, v25, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v26, v3, s0 | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v27, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v28, v3, vcc_lo | 
|  | ; GFX11-NEXT:    s_cbranch_scc0 .LBB1_2 | 
|  | ; GFX11-NEXT:  ; %bb.3: ; %while.cond.loopexit | 
|  | ; GFX11-NEXT:    ; in Loop: Header=BB1_1 Depth=1 | 
|  | ; GFX11-NEXT:    s_add_i32 s0, s1, -1 | 
|  | ; GFX11-NEXT:    s_cmp_eq_u32 s1, 0 | 
|  | ; GFX11-NEXT:    s_cbranch_scc1 .LBB1_5 | 
|  | ; GFX11-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 | 
|  | ; GFX11-NEXT:    s_mov_b32 s1, s0 | 
|  | ; GFX11-NEXT:    s_branch .LBB1_1 | 
|  | ; GFX11-NEXT:  .LBB1_5: ; %while.end | 
|  | ; GFX11-NEXT:    v_add_co_u32 v0, s0, s34, v6 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s35, 0, s0 | 
|  | ; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off | 
|  | ; GFX11-NEXT:    s_endpgm | 
|  | entry: | 
|  | %call = tail call i64 @_Z13get_global_idj(i32 0) | 
|  | %conv = and i64 %call, 255 | 
|  | %a0 = shl i64 %call, 17 | 
|  | %idx.ext11 = and i64 %a0, 4261412864 | 
|  | %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11 | 
|  | %add.ptr6 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv | 
|  | br label %for.cond.preheader | 
|  |  | 
|  | while.cond.loopexit:                              ; preds = %for.body | 
|  | %dec = add nsw i32 %dec31, -1 | 
|  | %tobool = icmp eq i32 %dec31, 0 | 
|  | br i1 %tobool, label %while.end, label %for.cond.preheader | 
|  |  | 
|  | for.cond.preheader:                               ; preds = %entry, %while.cond.loopexit | 
|  | %dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ] | 
|  | %sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ] | 
|  | br label %for.body | 
|  |  | 
|  | for.body:                                         ; preds = %for.body, %for.cond.preheader | 
|  | %block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ] | 
|  | %sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ] | 
|  | %conv3 = zext i32 %block.029 to i64 | 
|  | %add.ptr8 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3 | 
|  | %load1 = load i64, ptr addrspace(1) %add.ptr8, align 8 | 
|  | %add = add i64 %load1, %sum.128 | 
|  |  | 
|  | %add9 = or disjoint i32 %block.029, 256 | 
|  | %conv3.1 = zext i32 %add9 to i64 | 
|  | %add.ptr8.1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.1 | 
|  | %load2 = load i64, ptr addrspace(1) %add.ptr8.1, align 8 | 
|  | %add.1 = add i64 %load2, %add | 
|  |  | 
|  | %add9.1 = or disjoint i32 %block.029, 512 | 
|  | %conv3.2 = zext i32 %add9.1 to i64 | 
|  | %add.ptr8.2 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.2 | 
|  | %l3 = load i64, ptr addrspace(1) %add.ptr8.2, align 8 | 
|  | %add.2 = add i64 %l3, %add.1 | 
|  |  | 
|  | %add9.2 = or disjoint i32 %block.029, 768 | 
|  | %conv3.3 = zext i32 %add9.2 to i64 | 
|  | %add.ptr8.3 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.3 | 
|  | %l4 = load i64, ptr addrspace(1) %add.ptr8.3, align 8 | 
|  | %add.3 = add i64 %l4, %add.2 | 
|  |  | 
|  | %add9.3 = or disjoint i32 %block.029, 1024 | 
|  | %conv3.4 = zext i32 %add9.3 to i64 | 
|  | %add.ptr8.4 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.4 | 
|  | %l5 = load i64, ptr addrspace(1) %add.ptr8.4, align 8 | 
|  | %add.4 = add i64 %l5, %add.3 | 
|  |  | 
|  | %add9.4 = or disjoint i32 %block.029, 1280 | 
|  | %conv3.5 = zext i32 %add9.4 to i64 | 
|  | %add.ptr8.5 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.5 | 
|  | %l6 = load i64, ptr addrspace(1) %add.ptr8.5, align 8 | 
|  | %add.5 = add i64 %l6, %add.4 | 
|  |  | 
|  | %add9.5 = or disjoint i32 %block.029, 1536 | 
|  | %conv3.6 = zext i32 %add9.5 to i64 | 
|  | %add.ptr8.6 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.6 | 
|  | %load7 = load i64, ptr addrspace(1) %add.ptr8.6, align 8 | 
|  | %add.6 = add i64 %load7, %add.5 | 
|  |  | 
|  | %add9.6 = or disjoint i32 %block.029, 1792 | 
|  | %conv3.7 = zext i32 %add9.6 to i64 | 
|  | %add.ptr8.7 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.7 | 
|  | %load8 = load i64, ptr addrspace(1) %add.ptr8.7, align 8 | 
|  | %add.7 = add i64 %load8, %add.6 | 
|  |  | 
|  | %add9.7 = or disjoint i32 %block.029, 2048 | 
|  | %conv3.8 = zext i32 %add9.7 to i64 | 
|  | %add.ptr8.8 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.8 | 
|  | %load9 = load i64, ptr addrspace(1) %add.ptr8.8, align 8 | 
|  | %add.8 = add i64 %load9, %add.7 | 
|  |  | 
|  | %add9.8 = or disjoint i32 %block.029, 2304 | 
|  | %conv3.9 = zext i32 %add9.8 to i64 | 
|  | %add.ptr8.9 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.9 | 
|  | %load10 = load i64, ptr addrspace(1) %add.ptr8.9, align 8 | 
|  | %add.9 = add i64 %load10, %add.8 | 
|  |  | 
|  | %add9.9 = or disjoint i32 %block.029, 2560 | 
|  | %conv3.10 = zext i32 %add9.9 to i64 | 
|  | %add.ptr8.10 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr6, i64 %conv3.10 | 
|  | %load11 = load i64, ptr addrspace(1) %add.ptr8.10, align 8 | 
|  | %add.10 = add i64 %load11, %add.9 | 
|  |  | 
|  | %add9.31 = add nuw nsw i32 %block.029, 8192 | 
|  | %cmp.31 = icmp ult i32 %add9.31, 4194304 | 
|  | br i1 %cmp.31, label %for.body, label %while.cond.loopexit | 
|  |  | 
|  | while.end:                                        ; preds = %while.cond.loopexit | 
|  | store i64 %add.10, ptr addrspace(1) %add.ptr12, align 8 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ; Using a 32-bit address. | 
|  | define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { | 
|  | ; GFX8-LABEL: Address32: | 
|  | ; GFX8:       ; %bb.0: ; %entry | 
|  | ; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX8-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX8-NEXT:    s_mov_b32 s39, 0xe80000 | 
|  | ; GFX8-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX8-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX8-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0 | 
|  | ; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX8-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX8-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5] | 
|  | ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v2, s35 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s34, v1 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v3, 2 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc | 
|  | ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x400 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x800 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0xc00 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x1000 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v12, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x1400 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x1800 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x1c00 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x2000 | 
|  | ; GFX8-NEXT:    flat_load_dword v0, v[3:4] | 
|  | ; GFX8-NEXT:    flat_load_dword v19, v[5:6] | 
|  | ; GFX8-NEXT:    flat_load_dword v7, v[7:8] | 
|  | ; GFX8-NEXT:    flat_load_dword v8, v[9:10] | 
|  | ; GFX8-NEXT:    flat_load_dword v9, v[11:12] | 
|  | ; GFX8-NEXT:    flat_load_dword v10, v[13:14] | 
|  | ; GFX8-NEXT:    flat_load_dword v11, v[15:16] | 
|  | ; GFX8-NEXT:    flat_load_dword v12, v[17:18] | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x2400, v3 | 
|  | ; GFX8-NEXT:    flat_load_dword v5, v[5:6] | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    flat_load_dword v3, v[3:4] | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(8) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v19, v0 | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(7) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v0 | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(6) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v8, v0 | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v9, v0 | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v10, v0 | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v11, v0 | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v12, v0 | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0 | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0 | 
|  | ; GFX8-NEXT:    flat_store_dword v[1:2], v0 | 
|  | ; GFX8-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX9-LABEL: Address32: | 
|  | ; GFX9:       ; %bb.0: ; %entry | 
|  | ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX9-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000 | 
|  | ; GFX9-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX9-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX9-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0 | 
|  | ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX9-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX9-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5] | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff8000, v1 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v1, s35 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v4 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v3, 2 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    s_movk_i32 s0, 0x1000 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    global_load_dword v5, v[0:1], off | 
|  | ; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:1024 | 
|  | ; GFX9-NEXT:    global_load_dword v7, v[0:1], off offset:2048 | 
|  | ; GFX9-NEXT:    global_load_dword v8, v[0:1], off offset:3072 | 
|  | ; GFX9-NEXT:    global_load_dword v9, v[2:3], off | 
|  | ; GFX9-NEXT:    global_load_dword v10, v[2:3], off offset:1024 | 
|  | ; GFX9-NEXT:    global_load_dword v11, v[2:3], off offset:2048 | 
|  | ; GFX9-NEXT:    global_load_dword v12, v[2:3], off offset:3072 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    global_load_dword v2, v[0:1], off | 
|  | ; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:1024 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(8) | 
|  | ; GFX9-NEXT:    v_add_u32_e32 v0, v6, v5 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(6) | 
|  | ; GFX9-NEXT:    v_add3_u32 v0, v7, v0, v8 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX9-NEXT:    v_add3_u32 v0, v9, v0, v10 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX9-NEXT:    v_add3_u32 v0, v11, v0, v12 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX9-NEXT:    v_add3_u32 v0, v2, v0, v3 | 
|  | ; GFX9-NEXT:    global_store_dword v4, v0, s[34:35] | 
|  | ; GFX9-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX10-LABEL: Address32: | 
|  | ; GFX10:       ; %bb.0: ; %entry | 
|  | ; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX10-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000 | 
|  | ; GFX10-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX10-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX10-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX10-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX10-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0 | 
|  | ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX10-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7] | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v2, 2 | 
|  | ; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff8000, v1 | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v10 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x800, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0x1000 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_clause 0x3 | 
|  | ; GFX10-NEXT:    global_load_dword v11, v[0:1], off | 
|  | ; GFX10-NEXT:    global_load_dword v12, v[0:1], off offset:1024 | 
|  | ; GFX10-NEXT:    global_load_dword v13, v[4:5], off offset:-2048 | 
|  | ; GFX10-NEXT:    global_load_dword v14, v[2:3], off offset:1024 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, 0x1000, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x1800, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v0, 0x2000 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_clause 0x2 | 
|  | ; GFX10-NEXT:    global_load_dword v15, v[4:5], off | 
|  | ; GFX10-NEXT:    global_load_dword v16, v[6:7], off offset:1024 | 
|  | ; GFX10-NEXT:    global_load_dword v17, v[2:3], off offset:1024 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_clause 0x2 | 
|  | ; GFX10-NEXT:    global_load_dword v2, v[8:9], off offset:-2048 | 
|  | ; GFX10-NEXT:    global_load_dword v3, v[8:9], off | 
|  | ; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:1024 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(8) | 
|  | ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v12, v11 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(6) | 
|  | ; GFX10-NEXT:    v_add3_u32 v0, v13, v0, v14 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX10-NEXT:    v_add3_u32 v0, v15, v0, v16 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX10-NEXT:    v_add3_u32 v0, v2, v0, v17 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX10-NEXT:    v_add3_u32 v0, v3, v0, v4 | 
|  | ; GFX10-NEXT:    global_store_dword v10, v0, s[34:35] | 
|  | ; GFX10-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX11-LABEL: Address32: | 
|  | ; GFX11:       ; %bb.0: ; %entry | 
|  | ; GFX11-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX11-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX11-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 | 
|  | ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 | 
|  | ; GFX11-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24 | 
|  | ; GFX11-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1] | 
|  | ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff8000, v1 | 
|  | ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v6 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v2, vcc_lo | 
|  | ; GFX11-NEXT:    s_clause 0x1 | 
|  | ; GFX11-NEXT:    global_load_b32 v7, v[0:1], off | 
|  | ; GFX11-NEXT:    global_load_b32 v8, v[0:1], off offset:1024 | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x1000, v0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo | 
|  | ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0x2000 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo | 
|  | ; GFX11-NEXT:    s_clause 0x5 | 
|  | ; GFX11-NEXT:    global_load_b32 v9, v[0:1], off offset:2048 | 
|  | ; GFX11-NEXT:    global_load_b32 v10, v[0:1], off offset:3072 | 
|  | ; GFX11-NEXT:    global_load_b32 v11, v[4:5], off offset:-4096 | 
|  | ; GFX11-NEXT:    global_load_b32 v12, v[2:3], off offset:1024 | 
|  | ; GFX11-NEXT:    global_load_b32 v13, v[2:3], off offset:2048 | 
|  | ; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:3072 | 
|  | ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo | 
|  | ; GFX11-NEXT:    s_clause 0x1 | 
|  | ; GFX11-NEXT:    global_load_b32 v3, v[4:5], off | 
|  | ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:1024 | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(8) | 
|  | ; GFX11-NEXT:    v_add_nc_u32_e32 v1, v8, v7 | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(6) | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add3_u32 v1, v9, v1, v10 | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX11-NEXT:    v_add3_u32 v1, v11, v1, v12 | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add3_u32 v1, v13, v1, v2 | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX11-NEXT:    v_add3_u32 v0, v3, v1, v0 | 
|  | ; GFX11-NEXT:    global_store_b32 v6, v0, s[34:35] | 
|  | ; GFX11-NEXT:    s_endpgm | 
|  | entry: | 
|  | %call = tail call i64 @_Z13get_global_idj(i32 0) | 
|  | %conv = and i64 %call, 255 | 
|  | %id = shl i64 %call, 7 | 
|  | %idx.ext11 = and i64 %id, 4294934528 | 
|  | %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11 | 
|  |  | 
|  | %add.ptr6 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr12, i64 %conv | 
|  | %load1 = load i32, ptr addrspace(1) %add.ptr6, align 4 | 
|  |  | 
|  | %add.ptr8.1 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 256 | 
|  | %load2 = load i32, ptr addrspace(1) %add.ptr8.1, align 4 | 
|  | %add.1 = add i32 %load2, %load1 | 
|  |  | 
|  | %add.ptr8.2 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 512 | 
|  | %load3 = load i32, ptr addrspace(1) %add.ptr8.2, align 4 | 
|  | %add.2 = add i32 %load3, %add.1 | 
|  |  | 
|  | %add.ptr8.3 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 768 | 
|  | %load4 = load i32, ptr addrspace(1) %add.ptr8.3, align 4 | 
|  | %add.3 = add i32 %load4, %add.2 | 
|  |  | 
|  | %add.ptr8.4 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 1024 | 
|  | %load5 = load i32, ptr addrspace(1) %add.ptr8.4, align 4 | 
|  | %add.4 = add i32 %load5, %add.3 | 
|  |  | 
|  | %add.ptr8.5 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 1280 | 
|  | %load6 = load i32, ptr addrspace(1) %add.ptr8.5, align 4 | 
|  | %add.5 = add i32 %load6, %add.4 | 
|  |  | 
|  | %add.ptr8.6 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 1536 | 
|  | %load7 = load i32, ptr addrspace(1) %add.ptr8.6, align 4 | 
|  | %add.6 = add i32 %load7, %add.5 | 
|  |  | 
|  | %add.ptr8.7 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 1792 | 
|  | %load8 = load i32, ptr addrspace(1) %add.ptr8.7, align 4 | 
|  | %add.7 = add i32 %load8, %add.6 | 
|  |  | 
|  | %add.ptr8.8 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 2048 | 
|  | %load9 = load i32, ptr addrspace(1) %add.ptr8.8, align 4 | 
|  | %add.8 = add i32 %load9, %add.7 | 
|  |  | 
|  | %add.ptr8.9 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr6, i64 2304 | 
|  | %load10 = load i32, ptr addrspace(1) %add.ptr8.9, align 4 | 
|  | %add.9 = add i32 %load10, %add.8 | 
|  |  | 
|  | store i32 %add.9, ptr addrspace(1) %add.ptr12, align 4 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ; Offset64: issues four i64 loads relative to a single pointer (%addr1) at | 
|  | ; constant element offsets 0, 536870400, 536870656 and 536870912, i.e. byte | 
|  | ; offsets 0, 0xFFFFF000, 0xFFFFF800 and 0x100000000. The largest offset does | 
|  | ; not fit in 32 bits. The four loaded values are summed and the result is | 
|  | ; stored to %add.ptr12. | 
|  | define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) { | 
|  | ; GFX8-LABEL: Offset64: | 
|  | ; GFX8:       ; %bb.0: ; %entry | 
|  | ; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX8-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX8-NEXT:    s_mov_b32 s39, 0xe80000 | 
|  | ; GFX8-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX8-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX8-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0 | 
|  | ; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX8-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX8-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5] | 
|  | ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v2, s35 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s34, v1 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v3, 3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc | 
|  | ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0xf000 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0xf800 | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[3:4] | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6] | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10] | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v4 | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4] | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v7 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v6, v8, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v9, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v10, v5, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc | 
|  | ; GFX8-NEXT:    flat_store_dwordx2 v[1:2], v[3:4] | 
|  | ; GFX8-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX9-LABEL: Offset64: | 
|  | ; GFX9:       ; %bb.0: ; %entry | 
|  | ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX9-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000 | 
|  | ; GFX9-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX9-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX9-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0 | 
|  | ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX9-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX9-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5] | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff8000, v1 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v1, s35 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v10 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v3, 3 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    s_movk_i32 s0, 0xf000 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:2048 | 
|  | ; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1 | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v4 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v5, vcc | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v9, v3, vcc | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc | 
|  | ; GFX9-NEXT:    global_store_dwordx2 v10, v[0:1], s[34:35] | 
|  | ; GFX9-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX10-LABEL: Offset64: | 
|  | ; GFX10:       ; %bb.0: ; %entry | 
|  | ; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX10-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000 | 
|  | ; GFX10-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX10-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX10-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX10-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX10-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0 | 
|  | ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX10-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7] | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v2, 3 | 
|  | ; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff8000, v1 | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v12 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0xfffff800 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_clause 0x1 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:-2048 | 
|  | ; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1 | 
|  | ; GFX10-NEXT:    s_clause 0x1 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v4 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v10, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo | 
|  | ; GFX10-NEXT:    global_store_dwordx2 v12, v[0:1], s[34:35] | 
|  | ; GFX10-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX11-LABEL: Offset64: | 
|  | ; GFX11:       ; %bb.0: ; %entry | 
|  | ; GFX11-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX11-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX11-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 | 
|  | ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 | 
|  | ; GFX11-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24 | 
|  | ; GFX11-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1] | 
|  | ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff8000, v1 | 
|  | ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v8 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v2, vcc_lo | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0xfffff000, v0 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo | 
|  | ; GFX11-NEXT:    s_clause 0x2 | 
|  | ; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off | 
|  | ; GFX11-NEXT:    global_load_b64 v[6:7], v[2:3], off | 
|  | ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:2048 | 
|  | ; GFX11-NEXT:    v_add_nc_u32_e32 v1, 1, v1 | 
|  | ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v6, v4 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, v7, v5, vcc_lo | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v3, v5, vcc_lo | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo | 
|  | ; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[34:35] | 
|  | ; GFX11-NEXT:    s_endpgm | 
|  | entry: | 
|  | ; %conv keeps the low 8 bits of the call result; it is the i64 element index | 
|  | ; applied on top of %add.ptr12 to form %addr1. | 
|  | %call = tail call i64 @_Z13get_global_idj(i32 0) | 
|  | %conv = and i64 %call, 255 | 
|  | ; (%call << 7) masked with 4294934528 (0xFFFF8000) is the byte offset of | 
|  | ; %add.ptr12 inside %buffer; %add.ptr12 is also the destination of the store. | 
|  | %a0 = shl i64 %call, 7 | 
|  | %idx.ext11 = and i64 %a0, 4294934528 | 
|  | %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11 | 
|  |  | 
|  | ; Load 1: the common base address, byte offset 0. | 
|  | %addr1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv | 
|  | %load1 = load i64, ptr addrspace(1) %addr1, align 8 | 
|  |  | 
|  | ; Load 2: 536870400 i64 elements past %addr1 = byte offset 0xFFFFF000. | 
|  | %addr2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 536870400 | 
|  | %load2 = load i64, ptr addrspace(1) %addr2, align 8 | 
|  |  | 
|  | %add1 = add i64 %load2, %load1 | 
|  |  | 
|  | ; Load 3: 536870656 i64 elements past %addr1 = byte offset 0xFFFFF800. | 
|  | %addr3 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 536870656 | 
|  | %load3 = load i64, ptr addrspace(1) %addr3, align 8 | 
|  |  | 
|  | %add2 = add i64 %load3, %add1 | 
|  |  | 
|  | ; Load 4: 536870912 i64 elements past %addr1 = byte offset 0x100000000 (2^32), | 
|  | ; which only changes the upper 32 bits of the address. | 
|  | %addr4 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 536870912 | 
|  | %load4 = load i64, ptr addrspace(1) %addr4, align 8 | 
|  | %add4 = add i64 %load4, %add2 | 
|  |  | 
|  | ; Store the 64-bit sum of the four loads. | 
|  | store i64 %add4, ptr addrspace(1) %add.ptr12, align 8 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ; TODO: Support load4 as anchor instruction. | 
|  | define amdgpu_kernel void @p32Offset64(ptr addrspace(1)  %buffer) { | 
|  | ; GFX8-LABEL: p32Offset64: | 
|  | ; GFX8:       ; %bb.0: ; %entry | 
|  | ; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX8-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX8-NEXT:    s_mov_b32 s39, 0xe80000 | 
|  | ; GFX8-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX8-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX8-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0 | 
|  | ; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX8-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX8-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5] | 
|  | ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v2, s35 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s34, v1 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v3, 2 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc | 
|  | ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc | 
|  | ; GFX8-NEXT:    s_mov_b32 s0, 0x7ffff800 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    s_mov_b32 s0, 0x7ffffc00 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    flat_load_dword v0, v[3:4] | 
|  | ; GFX8-NEXT:    flat_load_dword v5, v[5:6] | 
|  | ; GFX8-NEXT:    flat_load_dword v6, v[7:8] | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x80000000, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    flat_load_dword v3, v[3:4] | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0 | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v6, v0 | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0 | 
|  | ; GFX8-NEXT:    flat_store_dword v[1:2], v0 | 
|  | ; GFX8-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX9-LABEL: p32Offset64: | 
|  | ; GFX9:       ; %bb.0: ; %entry | 
|  | ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX9-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000 | 
|  | ; GFX9-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX9-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX9-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0 | 
|  | ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX9-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX9-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5] | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff8000, v1 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v1, s35 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v6 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v3, 2 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    s_mov_b32 s0, 0x7ffff000 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, 0x80000000, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    global_load_dword v7, v[0:1], off | 
|  | ; GFX9-NEXT:    global_load_dword v8, v[2:3], off offset:2048 | 
|  | ; GFX9-NEXT:    global_load_dword v9, v[2:3], off offset:3072 | 
|  | ; GFX9-NEXT:    global_load_dword v10, v[4:5], off | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX9-NEXT:    v_add_u32_e32 v0, v8, v7 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX9-NEXT:    v_add3_u32 v0, v9, v0, v10 | 
|  | ; GFX9-NEXT:    global_store_dword v6, v0, s[34:35] | 
|  | ; GFX9-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX10-LABEL: p32Offset64: | 
|  | ; GFX10:       ; %bb.0: ; %entry | 
|  | ; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX10-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000 | 
|  | ; GFX10-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX10-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX10-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX10-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX10-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0 | 
|  | ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX10-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7] | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v2, 2 | 
|  | ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff8000, v1 | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v6 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x80000000 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7ffff800, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_clause 0x3 | 
|  | ; GFX10-NEXT:    global_load_dword v7, v[0:1], off | 
|  | ; GFX10-NEXT:    global_load_dword v8, v[2:3], off offset:-2048 | 
|  | ; GFX10-NEXT:    global_load_dword v9, v[2:3], off | 
|  | ; GFX10-NEXT:    global_load_dword v10, v[4:5], off offset:1024 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v8, v7 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX10-NEXT:    v_add3_u32 v0, v10, v0, v9 | 
|  | ; GFX10-NEXT:    global_store_dword v6, v0, s[34:35] | 
|  | ; GFX10-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX11-LABEL: p32Offset64: | 
|  | ; GFX11:       ; %bb.0: ; %entry | 
|  | ; GFX11-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX11-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX11-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 | 
|  | ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 | 
|  | ; GFX11-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24 | 
|  | ; GFX11-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1] | 
|  | ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff8000, v1 | 
|  | ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v6 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v2, vcc_lo | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x7ffff000, v0 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo | 
|  | ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x80000000, v0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo | 
|  | ; GFX11-NEXT:    s_clause 0x3 | 
|  | ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off | 
|  | ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:2048 | 
|  | ; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:3072 | 
|  | ; GFX11-NEXT:    global_load_b32 v3, v[4:5], off | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX11-NEXT:    v_add_nc_u32_e32 v0, v1, v0 | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add3_u32 v0, v2, v0, v3 | 
|  | ; GFX11-NEXT:    global_store_b32 v6, v0, s[34:35] | 
|  | ; GFX11-NEXT:    s_endpgm | 
|  | entry: | 
|  | %call = tail call i64 @_Z13get_global_idj(i32 0) | 
|  | %conv = and i64 %call, 255 | 
|  | %a0 = shl i64 %call, 7 | 
|  | %idx.ext11 = and i64 %a0, 4294934528 | 
|  | %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11 | 
|  |  | 
|  | %addr1 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr12, i64 %conv | 
|  | %load1 = load i32, ptr addrspace(1) %addr1, align 8 | 
|  |  | 
|  | %addr2 = getelementptr inbounds i32, ptr addrspace(1) %addr1, i64 536870400 | 
|  | %load2 = load i32, ptr addrspace(1) %addr2, align 8 | 
|  |  | 
|  | %add1 = add i32 %load2, %load1 | 
|  |  | 
|  | %addr3 = getelementptr inbounds i32, ptr addrspace(1) %addr1, i64 536870656 | 
|  | %load3 = load i32, ptr addrspace(1) %addr3, align 8 | 
|  |  | 
|  | %add2 = add i32 %load3, %add1 | 
|  |  | 
|  | %addr4 = getelementptr inbounds i32, ptr addrspace(1) %addr1, i64 536870912 | 
|  | %load4 = load i32, ptr addrspace(1) %addr4, align 8 | 
|  | %add4 = add i32 %load4, %add2 | 
|  |  | 
|  | store i32 %add4, ptr addrspace(1) %add.ptr12, align 8 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ; DiffBase - kernel taking two global (addrspace(1)) pointers. It calls | 
|  | ; @_Z13get_global_idj(0), derives a single byte offset | 
|  | ; ((id << 7) & 0xffff8000) and applies it to both %buffer1 and %buffer2. | 
|  | ; Three i64 loads are issued at constant offsets from each of the two | 
|  | ; resulting bases, all six values are summed, and the sum is stored at the | 
|  | ; %buffer1-derived base. | 
|  | ; In the gfx900/gfx90a, gfx1010 and gfx1100 checks below, some loads reuse an | 
|  | ; already computed address register pair and carry the remaining distance in | 
|  | ; the instruction's offset field; in the gfx803 checks every load has its own | 
|  | ; address computation. | 
|  | ; NOTE(review): presumably the point of this case is that offsets are only | 
|  | ; merged between accesses sharing the same base pointer - confirm against the | 
|  | ; pass under test. | 
|  | ; The assertions are autogenerated (see the first line of this file); | 
|  | ; regenerate them with utils/update_llc_test_checks.py instead of editing | 
|  | ; them by hand. | 
|  | define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, | 
|  | ; GFX8-LABEL: DiffBase: | 
|  | ; GFX8:       ; %bb.0: ; %entry | 
|  | ; GFX8-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX8-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX8-NEXT:    s_mov_b32 s50, -1 | 
|  | ; GFX8-NEXT:    s_mov_b32 s51, 0xe80000 | 
|  | ; GFX8-NEXT:    s_add_u32 s48, s48, s11 | 
|  | ; GFX8-NEXT:    s_addc_u32 s49, s49, 0 | 
|  | ; GFX8-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX8-NEXT:    s_load_dwordx4 s[36:39], s[4:5], 0x24 | 
|  | ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0 | 
|  | ; GFX8-NEXT:    s_mov_b64 s[0:1], s[48:49] | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX8-NEXT:    s_mov_b64 s[2:3], s[50:51] | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX8-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX8-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5] | 
|  | ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 7, v0 | 
|  | ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff8000, v0 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v1, s37 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s36, v2 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v3, s39 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s38, v2 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v3, vcc | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x1000, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x1800, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x2000, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x2800, v12 | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[4:5] | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[6:7], v[6:7] | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v13, vcc | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 0x3000, v12 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v13, vcc | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[8:9] | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[10:11], v[10:11] | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 0x3800, v12 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v13, vcc | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[12:13], v[12:13] | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v2 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v6, v2 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v7, v3, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v10, v8 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v11, v9, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v12, v4 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v13, v5, vcc | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc | 
|  | ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] | 
|  | ; GFX8-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX9-LABEL: DiffBase: | 
|  | ; GFX9:       ; %bb.0: ; %entry | 
|  | ; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX9-NEXT:    s_mov_b32 s50, -1 | 
|  | ; GFX9-NEXT:    s_mov_b32 s51, 0xe00000 | 
|  | ; GFX9-NEXT:    s_add_u32 s48, s48, s11 | 
|  | ; GFX9-NEXT:    s_addc_u32 s49, s49, 0 | 
|  | ; GFX9-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX9-NEXT:    s_load_dwordx4 s[36:39], s[4:5], 0x24 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0 | 
|  | ; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49] | 
|  | ; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51] | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX9-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX9-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5] | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 7, v0 | 
|  | ; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff8000, v0 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v0, s37 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s36, v16 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v0, vcc | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v0, s39 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, s38, v16 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v0, vcc | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v2 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0x2000, v2 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:2048 | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v10 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v11, vcc | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0x3000, v10 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v11, vcc | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off offset:2048 | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[12:13], v[2:3], off | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[2:3], off offset:2048 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v4 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v5, vcc | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v12, v10 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v13, v11, vcc | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v14, v2 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v15, v3, vcc | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc | 
|  | ; GFX9-NEXT:    global_store_dwordx2 v16, v[0:1], s[36:37] | 
|  | ; GFX9-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX10-LABEL: DiffBase: | 
|  | ; GFX10:       ; %bb.0: ; %entry | 
|  | ; GFX10-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX10-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX10-NEXT:    s_mov_b32 s50, -1 | 
|  | ; GFX10-NEXT:    s_mov_b32 s51, 0x31c16000 | 
|  | ; GFX10-NEXT:    s_add_u32 s48, s48, s11 | 
|  | ; GFX10-NEXT:    s_addc_u32 s49, s49, 0 | 
|  | ; GFX10-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX10-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX10-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0 | 
|  | ; GFX10-NEXT:    s_load_dwordx4 s[36:39], s[4:5], 0x24 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX10-NEXT:    s_mov_b64 s[0:1], s[48:49] | 
|  | ; GFX10-NEXT:    s_mov_b64 s[2:3], s[50:51] | 
|  | ; GFX10-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7] | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 7, v0 | 
|  | ; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff8000, v0 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v8, s0, s36, v16 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s0, s37, 0, s0 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v12, s0, s38, v16 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v13, s0, s39, 0, s0 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, 0x1800 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v9, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v12, 0x3000 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v13, vcc_lo | 
|  | ; GFX10-NEXT:    s_clause 0x1 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v8 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v9, vcc_lo | 
|  | ; GFX10-NEXT:    s_clause 0x1 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:-2048 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[2:3], off | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x3800, v12 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v13, vcc_lo | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[0:1], off | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[14:15], v[2:3], off | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v4 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v10, v8 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v11, v9, vcc_lo | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v12, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v14, v2 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo | 
|  | ; GFX10-NEXT:    global_store_dwordx2 v16, v[0:1], s[36:37] | 
|  | ; GFX10-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX11-LABEL: DiffBase: | 
|  | ; GFX11:       ; %bb.0: ; %entry | 
|  | ; GFX11-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX11-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX11-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 | 
|  | ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 | 
|  | ; GFX11-NEXT:    s_load_b128 s[36:39], s[4:5], 0x24 | 
|  | ; GFX11-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1] | 
|  | ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 7, v0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff8000, v0 | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, s0, s36, v12 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s37, 0, s0 | 
|  | ; GFX11-NEXT:    v_add_co_u32 v8, s0, s38, v12 | 
|  | ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 0x2000 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, null, s39, 0, s0 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo | 
|  | ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x2000, v8 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, v9, vcc_lo | 
|  | ; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, 0x3000, v8 | 
|  | ; GFX11-NEXT:    global_load_b64 v[6:7], v[2:3], off offset:-4096 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo | 
|  | ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048 | 
|  | ; GFX11-NEXT:    s_clause 0x1 | 
|  | ; GFX11-NEXT:    global_load_b64 v[4:5], v[4:5], off offset:2048 | 
|  | ; GFX11-NEXT:    global_load_b64 v[10:11], v[8:9], off | 
|  | ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off | 
|  | ; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off offset:2048 | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v6 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v7, vcc_lo | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v10, v4 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, v11, v5, vcc_lo | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v4 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v9, v5, vcc_lo | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo | 
|  | ; GFX11-NEXT:    global_store_b64 v12, v[0:1], s[36:37] | 
|  | ; GFX11-NEXT:    s_endpgm | 
|  | ptr addrspace(1) %buffer2) { | 
|  | entry: | 
|  | %call = tail call i64 @_Z13get_global_idj(i32 0) | 
|  | ; %conv is not referenced again anywhere in this function. | 
|  | %conv = and i64 %call, 255 | 
|  | ; Byte offset shared by both buffers: (%call << 7) & 0xffff8000 | 
|  | ; (4294934528 == 0xffff8000). | 
|  | %a0 = shl i64 %call, 7 | 
|  | %idx.ext11 = and i64 %a0, 4294934528 | 
|  | %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer1, i64 %idx.ext11 | 
|  |  | 
|  | %add.ptr2 = getelementptr inbounds i8, ptr addrspace(1) %buffer2, i64 %idx.ext11 | 
|  |  | 
|  | ; Loads based on %add.ptr12 (%buffer1): i64 indices 512, 768 and 1024, | 
|  | ; i.e. byte offsets 0x1000, 0x1800 and 0x2000. | 
|  | %addr1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 512 | 
|  | %load1 = load i64, ptr addrspace(1) %addr1, align 8 | 
|  | %add.ptr8.3 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 768 | 
|  | %load2 = load i64, ptr addrspace(1) %add.ptr8.3, align 8 | 
|  | %add1 = add i64 %load2, %load1 | 
|  | %add.ptr8.4 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 1024 | 
|  | %load3 = load i64, ptr addrspace(1) %add.ptr8.4, align 8 | 
|  | %add2 = add i64 %load3, %add1 | 
|  |  | 
|  | ; Loads based on %add.ptr2 (%buffer2): i64 indices 1280, 1536 and 1792, | 
|  | ; i.e. byte offsets 0x2800, 0x3000 and 0x3800. | 
|  | %add.ptr8.5 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr2, i64 1280 | 
|  | %load4 = load i64, ptr addrspace(1) %add.ptr8.5, align 8 | 
|  |  | 
|  | %add.ptr8.6 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr2, i64 1536 | 
|  | %load5 = load i64, ptr addrspace(1) %add.ptr8.6, align 8 | 
|  | %add3 = add i64 %load5, %load4 | 
|  |  | 
|  | %add.ptr8.7 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr2, i64 1792 | 
|  | %load6 = load i64, ptr addrspace(1) %add.ptr8.7, align 8 | 
|  | %add4 = add i64 %load6, %add3 | 
|  |  | 
|  | ; Combine the two per-buffer partial sums. | 
|  | %add5 = add i64 %add2, %add4 | 
|  |  | 
|  | ; The total is written at the %buffer1-derived base address (offset 0). | 
|  | store i64 %add5, ptr addrspace(1) %add.ptr12, align 8 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { | 
|  | ; GFX8-LABEL: ReverseOrder: | 
|  | ; GFX8:       ; %bb.0: ; %entry | 
|  | ; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX8-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX8-NEXT:    s_mov_b32 s39, 0xe80000 | 
|  | ; GFX8-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX8-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX8-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0 | 
|  | ; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX8-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX8-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5] | 
|  | ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v2, s35 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s34, v1 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v3, 3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc | 
|  | ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x3800 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x3000 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x2800 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[11:12], v[3:4] | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6] | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[7:8] | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10] | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x2000 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x1800 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[13:14], v[13:14] | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[15:16], v[15:16] | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x1000 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[17:18], v[17:18] | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x800, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4] | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(6) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v11 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v6, v12, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v8, v5, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v9, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v10, v5, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v13, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v14, v5, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v15, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v16, v5, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v17, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v18, v5, vcc | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc | 
|  | ; GFX8-NEXT:    flat_store_dwordx2 v[1:2], v[3:4] | 
|  | ; GFX8-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX9-LABEL: ReverseOrder: | 
|  | ; GFX9:       ; %bb.0: ; %entry | 
|  | ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX9-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000 | 
|  | ; GFX9-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX9-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX9-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0 | 
|  | ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX9-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX9-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5] | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff8000, v1 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v1, s35 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v22 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v3, 3 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    s_movk_i32 s0, 0x3000 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:2048 | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off | 
|  | ; GFX9-NEXT:    s_movk_i32 s0, 0x2000 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[2:3], off offset:2048 | 
|  | ; GFX9-NEXT:    s_movk_i32 s0, 0x1000 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, s0, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[12:13], off | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[2:3], off | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[18:19], v[12:13], off offset:2048 | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[20:21], v[0:1], off offset:2048 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(6) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v4 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v5, vcc | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v16, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v17, v1, vcc | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v18, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v19, v1, vcc | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v20, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v21, v1, vcc | 
|  | ; GFX9-NEXT:    global_store_dwordx2 v22, v[0:1], s[34:35] | 
|  | ; GFX9-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX10-LABEL: ReverseOrder: | 
|  | ; GFX10:       ; %bb.0: ; %entry | 
|  | ; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX10-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000 | 
|  | ; GFX10-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX10-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX10-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX10-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX10-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0 | 
|  | ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX10-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7] | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v2, 3 | 
|  | ; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff8000, v1 | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v20 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x3800, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x3000, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_clause 0x1 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x2800, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, 0x2000, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, 0x1800, v0 | 
|  | ; GFX10-NEXT:    s_clause 0x1 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[4:5], off | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[10:11], off | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v14, vcc_lo, 0x1000, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_clause 0x1 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[12:13], off | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_clause 0x1 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[16:17], v[14:15], off | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[18:19], v[0:1], off | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(6) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v6 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v7, vcc_lo | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v10, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v12, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v16, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v17, v1, vcc_lo | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v18, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo | 
|  | ; GFX10-NEXT:    global_store_dwordx2 v20, v[0:1], s[34:35] | 
|  | ; GFX10-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX11-LABEL: ReverseOrder: | 
|  | ; GFX11:       ; %bb.0: ; %entry | 
|  | ; GFX11-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX11-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX11-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 | 
|  | ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 | 
|  | ; GFX11-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24 | 
|  | ; GFX11-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1] | 
|  | ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff8000, v1 | 
|  | ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v16 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v2, vcc_lo | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x3000, v0 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo | 
|  | ; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, 0x2000, v0 | 
|  | ; GFX11-NEXT:    s_clause 0x2 | 
|  | ; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off | 
|  | ; GFX11-NEXT:    global_load_b64 v[6:7], v[2:3], off offset:2048 | 
|  | ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo | 
|  | ; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, 0x1000, v0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, v1, vcc_lo | 
|  | ; GFX11-NEXT:    s_clause 0x4 | 
|  | ; GFX11-NEXT:    global_load_b64 v[12:13], v[8:9], off offset:2048 | 
|  | ; GFX11-NEXT:    global_load_b64 v[14:15], v[10:11], off | 
|  | ; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off | 
|  | ; GFX11-NEXT:    global_load_b64 v[10:11], v[10:11], off offset:2048 | 
|  | ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048 | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(6) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v6, v4 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, v7, v5, vcc_lo | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v3, v5, vcc_lo | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v12, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v13, v3, vcc_lo | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v9, v3, vcc_lo | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v10, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v11, v3, vcc_lo | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v14, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v15, v3, vcc_lo | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo | 
|  | ; GFX11-NEXT:    global_store_b64 v16, v[0:1], s[34:35] | 
|  | ; GFX11-NEXT:    s_endpgm | 
|  | entry: | 
|  | %call = tail call i64 @_Z13get_global_idj(i32 0) | 
|  | %conv = and i64 %call, 255 | 
|  | %a0 = shl i64 %call, 7 | 
|  | %idx.ext11 = and i64 %a0, 4294934528 | 
|  | %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11 | 
|  |  | 
|  | %addr1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv | 
|  | %load1 = load i64, ptr addrspace(1) %addr1, align 8 | 
|  |  | 
|  | %add.ptr8.7 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1792 | 
|  | %load8 = load i64, ptr addrspace(1) %add.ptr8.7, align 8 | 
|  | %add7 = add i64 %load8, %load1 | 
|  |  | 
|  | %add.ptr8.6 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1536 | 
|  | %load7 = load i64, ptr addrspace(1) %add.ptr8.6, align 8 | 
|  | %add6 = add i64 %load7, %add7 | 
|  |  | 
|  | %add.ptr8.5 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1280 | 
|  | %load6 = load i64, ptr addrspace(1) %add.ptr8.5, align 8 | 
|  | %add5 = add i64 %load6, %add6 | 
|  |  | 
|  | %add.ptr8.4 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 1024 | 
|  | %load5 = load i64, ptr addrspace(1) %add.ptr8.4, align 8 | 
|  | %add4 = add i64 %load5, %add5 | 
|  |  | 
|  | %add.ptr8.3 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 768 | 
|  | %load4 = load i64, ptr addrspace(1) %add.ptr8.3, align 8 | 
|  | %add3 = add i64 %load4, %add4 | 
|  |  | 
|  | %add.ptr8.2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 512 | 
|  | %load3 = load i64, ptr addrspace(1) %add.ptr8.2, align 8 | 
|  | %add2 = add i64 %load3, %add3 | 
|  |  | 
|  | %addr2 = getelementptr inbounds i64, ptr addrspace(1) %addr1, i64 256 | 
|  | %load2 = load i64, ptr addrspace(1) %addr2, align 8 | 
|  | %add1 = add i64 %load2, %add2 | 
|  |  | 
|  | store i64 %add1, ptr addrspace(1) %add.ptr12, align 8 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ; Test kernel: loads two i64 values at large negative constant element offsets | 
|  | ; (-536870656 and -536870912) from a computed base pointer, adds them, and | 
|  | ; stores the sum through %add.ptr12. | 
|  | ; In the expected output below, every target adds -1 to the high dword of the | 
|  | ; address for both loads; the GFX9 and GFX11 checks also carry part of the low | 
|  | ; offset in an `offset:-2048` immediate on the first load. | 
|  | define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buffer) { | 
|  | ; GFX8-LABEL: negativeoffset: | 
|  | ; GFX8:       ; %bb.0: ; %entry | 
|  | ; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX8-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX8-NEXT:    s_mov_b32 s39, 0xe80000 | 
|  | ; GFX8-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX8-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX8-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX8-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX8-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0 | 
|  | ; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX8-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX8-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5] | 
|  | ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v2, s35 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s34, v1 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v3, 3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc | 
|  | ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc | 
|  | ; GFX8-NEXT:    s_movk_i32 s0, 0x800 | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, -1, v0, vcc | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v0 | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6] | 
|  | ; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4] | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5 | 
|  | ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc | 
|  | ; GFX8-NEXT:    flat_store_dwordx2 v[1:2], v[3:4] | 
|  | ; GFX8-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX9-LABEL: negativeoffset: | 
|  | ; GFX9:       ; %bb.0: ; %entry | 
|  | ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX9-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000 | 
|  | ; GFX9-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX9-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX9-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0 | 
|  | ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX9-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX9-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5] | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff8000, v1 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v1, s35 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v8 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v3, 3 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0x1000, v0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v1, vcc | 
|  | ; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1 | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:-2048 | 
|  | ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v4 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v5, vcc | 
|  | ; GFX9-NEXT:    global_store_dwordx2 v8, v[0:1], s[34:35] | 
|  | ; GFX9-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX10-LABEL: negativeoffset: | 
|  | ; GFX10:       ; %bb.0: ; %entry | 
|  | ; GFX10-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0 | 
|  | ; GFX10-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1 | 
|  | ; GFX10-NEXT:    s_mov_b32 s38, -1 | 
|  | ; GFX10-NEXT:    s_mov_b32 s39, 0x31c16000 | 
|  | ; GFX10-NEXT:    s_add_u32 s36, s36, s11 | 
|  | ; GFX10-NEXT:    s_addc_u32 s37, s37, 0 | 
|  | ; GFX10-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX10-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX10-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v31, v0 | 
|  | ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0 | 
|  | ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v0, 0 | 
|  | ; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37] | 
|  | ; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39] | 
|  | ; GFX10-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7] | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v2, 3 | 
|  | ; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff8000, v1 | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v8 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x800, v0 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v1, vcc_lo | 
|  | ; GFX10-NEXT:    v_add_nc_u32_e32 v1, -1, v1 | 
|  | ; GFX10-NEXT:    s_clause 0x1 | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off | 
|  | ; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v4 | 
|  | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo | 
|  | ; GFX10-NEXT:    global_store_dwordx2 v8, v[0:1], s[34:35] | 
|  | ; GFX10-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX11-LABEL: negativeoffset: | 
|  | ; GFX11:       ; %bb.0: ; %entry | 
|  | ; GFX11-NEXT:    s_getpc_b64 s[0:1] | 
|  | ; GFX11-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 | 
|  | ; GFX11-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 | 
|  | ; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 | 
|  | ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 | 
|  | ; GFX11-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24 | 
|  | ; GFX11-NEXT:    s_mov_b32 s32, 0 | 
|  | ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1] | 
|  | ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0 | 
|  | ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) | 
|  | ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff8000, v1 | 
|  | ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v4 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v2, vcc_lo | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x1000, v0 | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo | 
|  | ; GFX11-NEXT:    v_add_nc_u32_e32 v1, -1, v1 | 
|  | ; GFX11-NEXT:    s_clause 0x1 | 
|  | ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:-2048 | 
|  | ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off | 
|  | ; GFX11-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2 | 
|  | ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | 
|  | ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo | 
|  | ; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[34:35] | 
|  | ; GFX11-NEXT:    s_endpgm | 
|  | entry: | 
|  | ; NOTE(review): attribute group #2 used on this call has no definition in the | 
|  | ; visible part of this file (only `attributes #0` appears) - confirm whether | 
|  | ; it is intentional or a leftover. | 
|  | %call = tail call i64 @_Z13get_global_idj(i32 0) #2 | 
|  | ; %conv keeps the low 8 bits of the id; %idx.ext11 is (id << 7) masked with | 
|  | ; 4294934528 (0xffff8000) and is applied to %buffer as a byte offset. | 
|  | %conv = and i64 %call, 255 | 
|  | %0 = shl i64 %call, 7 | 
|  | %idx.ext11 = and i64 %0, 4294934528 | 
|  | %add.ptr12 = getelementptr inbounds i8, ptr addrspace(1) %buffer, i64 %idx.ext11 | 
|  |  | 
|  | ; %conv indexes i64 elements on top of the byte-offset base. | 
|  | %buffer_wave = getelementptr inbounds i64, ptr addrspace(1) %add.ptr12, i64 %conv | 
|  |  | 
|  | ; -536870656 i64 elements = -0x100000000 + 0x800 bytes. | 
|  | %addr1 = getelementptr inbounds i64, ptr addrspace(1) %buffer_wave, i64 -536870656 | 
|  | %load1 = load i64, ptr addrspace(1) %addr1, align 8 | 
|  |  | 
|  | ; -536870912 i64 elements = -0x100000000 bytes, i.e. 2048 bytes below %addr1. | 
|  | %addr2 = getelementptr inbounds i64, ptr addrspace(1) %buffer_wave, i64 -536870912 | 
|  | %load2 = load i64, ptr addrspace(1) %addr2, align 8 | 
|  |  | 
|  |  | 
|  | %add = add i64 %load2, %load1 | 
|  |  | 
|  | ; The sum is stored at the byte-offset base, not at %buffer_wave. | 
|  | store i64 %add, ptr addrspace(1) %add.ptr12, align 8 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ; Test kernel: loads one byte at offset -1 from a flat pointer obtained by | 
|  | ; addrspacecasting an addrspace(5) null pointer, compares it with 0, and then | 
|  | ; loops in %branch until that (loop-invariant) comparison is true. | 
|  | ; In the expected output below, the high dword of the address comes from | 
|  | ; src_private_base on GFX9 and later (from an `s_load_dword ... 0xec` on GFX8), | 
|  | ; and -1 is added across both dwords before the flat byte load. | 
|  | define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) { | 
|  | ; GFX8-LABEL: negativeoffsetnullptr: | 
|  | ; GFX8:       ; %bb.0: ; %entry | 
|  | ; GFX8-NEXT:    s_load_dword s1, s[4:5], 0xec | 
|  | ; GFX8-NEXT:    s_add_u32 s0, 0, -1 | 
|  | ; GFX8-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX8-NEXT:    s_addc_u32 s1, s1, -1 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v0, s0 | 
|  | ; GFX8-NEXT:    v_mov_b32_e32 v1, s1 | 
|  | ; GFX8-NEXT:    flat_load_ubyte v0, v[0:1] | 
|  | ; GFX8-NEXT:    s_mov_b64 s[0:1], 0 | 
|  | ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
|  | ; GFX8-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0 | 
|  | ; GFX8-NEXT:  .LBB8_1: ; %branch | 
|  | ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1 | 
|  | ; GFX8-NEXT:    s_and_b64 s[2:3], exec, vcc | 
|  | ; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1] | 
|  | ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[0:1] | 
|  | ; GFX8-NEXT:    s_cbranch_execnz .LBB8_1 | 
|  | ; GFX8-NEXT:  ; %bb.2: ; %end | 
|  | ; GFX8-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX9-LABEL: negativeoffsetnullptr: | 
|  | ; GFX9:       ; %bb.0: ; %entry | 
|  | ; GFX9-NEXT:    s_mov_b64 s[0:1], src_private_base | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v1, s1 | 
|  | ; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, -1, 0 | 
|  | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc | 
|  | ; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] | 
|  | ; GFX9-NEXT:    s_mov_b64 s[0:1], 0 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
|  | ; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0 | 
|  | ; GFX9-NEXT:  .LBB8_1: ; %branch | 
|  | ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1 | 
|  | ; GFX9-NEXT:    s_and_b64 s[2:3], exec, vcc | 
|  | ; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1] | 
|  | ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1] | 
|  | ; GFX9-NEXT:    s_cbranch_execnz .LBB8_1 | 
|  | ; GFX9-NEXT:  ; %bb.2: ; %end | 
|  | ; GFX9-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX10-LABEL: negativeoffsetnullptr: | 
|  | ; GFX10:       ; %bb.0: ; %entry | 
|  | ; GFX10-NEXT:    s_mov_b64 s[0:1], src_private_base | 
|  | ; GFX10-NEXT:    s_add_u32 s0, 0, -1 | 
|  | ; GFX10-NEXT:    s_addc_u32 s1, s1, -1 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v0, s0 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v1, s1 | 
|  | ; GFX10-NEXT:    s_mov_b32 s0, 0 | 
|  | ; GFX10-NEXT:    flat_load_ubyte v0, v[0:1] | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
|  | ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0 | 
|  | ; GFX10-NEXT:  .LBB8_1: ; %branch | 
|  | ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1 | 
|  | ; GFX10-NEXT:    s_and_b32 s1, exec_lo, vcc_lo | 
|  | ; GFX10-NEXT:    s_or_b32 s0, s1, s0 | 
|  | ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0 | 
|  | ; GFX10-NEXT:    s_cbranch_execnz .LBB8_1 | 
|  | ; GFX10-NEXT:  ; %bb.2: ; %end | 
|  | ; GFX10-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX11-TRUE16-LABEL: negativeoffsetnullptr: | 
|  | ; GFX11-TRUE16:       ; %bb.0: ; %entry | 
|  | ; GFX11-TRUE16-NEXT:    s_mov_b64 s[0:1], src_private_base | 
|  | ; GFX11-TRUE16-NEXT:    v_add_co_u32 v0, s0, -1, 0 | 
|  | ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | 
|  | ; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0 | 
|  | ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0 | 
|  | ; GFX11-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] | 
|  | ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
|  | ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l | 
|  | ; GFX11-TRUE16-NEXT:  .LBB8_1: ; %branch | 
|  | ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1 | 
|  | ; GFX11-TRUE16-NEXT:    s_and_b32 s1, exec_lo, vcc_lo | 
|  | ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) | 
|  | ; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0 | 
|  | ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0 | 
|  | ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB8_1 | 
|  | ; GFX11-TRUE16-NEXT:  ; %bb.2: ; %end | 
|  | ; GFX11-TRUE16-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX11-FAKE16-LABEL: negativeoffsetnullptr: | 
|  | ; GFX11-FAKE16:       ; %bb.0: ; %entry | 
|  | ; GFX11-FAKE16-NEXT:    s_mov_b64 s[0:1], src_private_base | 
|  | ; GFX11-FAKE16-NEXT:    v_add_co_u32 v0, s0, -1, 0 | 
|  | ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | 
|  | ; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0 | 
|  | ; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0 | 
|  | ; GFX11-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] | 
|  | ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
|  | ; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0 | 
|  | ; GFX11-FAKE16-NEXT:  .LBB8_1: ; %branch | 
|  | ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1 | 
|  | ; GFX11-FAKE16-NEXT:    s_and_b32 s1, exec_lo, vcc_lo | 
|  | ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) | 
|  | ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0 | 
|  | ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0 | 
|  | ; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB8_1 | 
|  | ; GFX11-FAKE16-NEXT:  ; %bb.2: ; %end | 
|  | ; GFX11-FAKE16-NEXT:    s_endpgm | 
|  | entry: | 
|  | ; The select condition is the constant false, so %null is always the | 
|  | ; addrspacecast of the addrspace(5) null pointer; %buffer is never chosen. | 
|  | %null = select i1 false, ptr %buffer, ptr addrspacecast (ptr addrspace(5) null to ptr) | 
|  | ; Step one byte below the cast null pointer (negative constant offset). | 
|  | %gep = getelementptr i8, ptr %null, i64 -1 | 
|  | %ld = load i8, ptr %gep | 
|  | %cmp = icmp eq i8 %ld, 0 | 
|  | br label %branch | 
|  |  | 
|  | ; %cmp is computed once in %entry; this block branches back to itself | 
|  | ; whenever %cmp is false. | 
|  | branch: | 
|  | br i1 %cmp, label %end, label %branch | 
|  |  | 
|  | end: | 
|  | ret void | 
|  | } | 
|  |  | 
|  |  | 
|  | ; Attribute group #0 is the one referenced by the @_Z13get_global_idj declaration near the top of the file. | 
|  | attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } |