|  | ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | 
|  | ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefix=MUBUF %s | 
|  | ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=FLATSCR %s | 
|  |  | 
|  | ; Test that the VGPR spiller correctly switches to SGPR offsets when the | 
|  | ; instruction offset field would overflow, and that it accounts for memory | 
|  | ; swizzling. | 
|  |  | 
|  | define amdgpu_kernel void @test_inst_offset_kernel() { | 
|  | ; MUBUF-LABEL: test_inst_offset_kernel: | 
|  | ; MUBUF:       ; %bb.0: ; %entry | 
|  | ; MUBUF-NEXT:    s_add_u32 s0, s0, s17 | 
|  | ; MUBUF-NEXT:    s_addc_u32 s1, s1, 0 | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 glc | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4 | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    s_endpgm | 
|  | ; | 
|  | ; FLATSCR-LABEL: test_inst_offset_kernel: | 
|  | ; FLATSCR:       ; %bb.0: ; %entry | 
|  | ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13 | 
|  | ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0 | 
|  | ; FLATSCR-NEXT:    s_mov_b32 s0, 0 | 
|  | ; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    s_movk_i32 s0, 0xff8 | 
|  | ; FLATSCR-NEXT:    scratch_store_dword off, v0, s0 ; 4-byte Folded Spill | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 ; 4-byte Folded Reload | 
|  | ; FLATSCR-NEXT:    s_mov_b32 s0, 0 | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    scratch_store_dword off, v0, s0 offset:4 | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    s_endpgm | 
|  | entry: | 
|  | ; Occupy 4088 bytes of scratch, so the offset of the spill of %a (4088) still | 
|  | ; fits in the instruction offset field (see offset:4088 in the MUBUF checks). | 
|  | %alloca = alloca i8, i32 4088, align 4, addrspace(5) | 
|  |  | 
|  | %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 | 
|  |  | 
|  |  | 
|  | %a = load volatile i32, ptr addrspace(5) %aptr | 
|  |  | 
|  | ; Force %a to spill by clobbering v0-v7. | 
|  | call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () | 
|  |  | 
|  | %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 | 
|  | store volatile i32 %a, ptr addrspace(5) %outptr | 
|  |  | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @test_sgpr_offset_kernel() { | 
|  | ; MUBUF-LABEL: test_sgpr_offset_kernel: | 
|  | ; MUBUF:       ; %bb.0: ; %entry | 
|  | ; MUBUF-NEXT:    s_add_u32 s0, s0, s17 | 
|  | ; MUBUF-NEXT:    s_addc_u32 s1, s1, 0 | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    s_mov_b32 s4, 0x40000 | 
|  | ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8 | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    s_endpgm | 
|  | ; | 
|  | ; FLATSCR-LABEL: test_sgpr_offset_kernel: | 
|  | ; FLATSCR:       ; %bb.0: ; %entry | 
|  | ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13 | 
|  | ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0 | 
|  | ; FLATSCR-NEXT:    s_mov_b32 s0, 0 | 
|  | ; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 offset:8 glc | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    s_movk_i32 s0, 0x1000 | 
|  | ; FLATSCR-NEXT:    scratch_store_dword off, v0, s0 ; 4-byte Folded Spill | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 ; 4-byte Folded Reload | 
|  | ; FLATSCR-NEXT:    s_mov_b32 s0, 0 | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    scratch_store_dword off, v0, s0 offset:8 | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    s_endpgm | 
|  | entry: | 
|  | ; Allocate 4092 bytes of scratch; the spill of %a then lands at offset 4096, | 
|  | ; which does not fit in the instruction, and has to live in the SGPR offset. | 
|  | %alloca = alloca i8, i32 4092, align 4, addrspace(5) | 
|  |  | 
|  | %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 | 
|  | ; 0x40000 / 64 = 4096 (for wave64) | 
|  | %a = load volatile i32, ptr addrspace(5) %aptr | 
|  | ; Force %a to spill | 
|  | call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () | 
|  |  | 
|  | %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 | 
|  | store volatile i32 %a, ptr addrspace(5) %outptr | 
|  |  | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define void @test_sgpr_offset_function_scavenge_fail_func() #2 { | 
|  | ; MUBUF-LABEL: test_sgpr_offset_function_scavenge_fail_func: | 
|  | ; MUBUF:       ; %bb.0: ; %entry | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 glc | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    s_add_i32 s10, s32, 0x40100 | 
|  | ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    s_add_i32 s10, s32, 0x40100 | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    s_setpc_b64 s[30:31] | 
|  | ; | 
|  | ; FLATSCR-LABEL: test_sgpr_offset_function_scavenge_fail_func: | 
|  | ; FLATSCR:       ; %bb.0: ; %entry | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:8 glc | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    s_add_i32 s8, s32, 0x1004 | 
|  | ; FLATSCR-NEXT:    scratch_store_dword off, v0, s8 ; 4-byte Folded Spill | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    s_add_i32 s8, s32, 0x1004 | 
|  | ; FLATSCR-NEXT:    scratch_load_dword v0, off, s8 ; 4-byte Folded Reload | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    s_setpc_b64 s[30:31] | 
|  | entry: | 
|  | ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not | 
|  | ; fit in the instruction, and has to live in the SGPR offset. | 
|  | %alloca = alloca i8, i32 4096, align 4, addrspace(5) | 
|  |  | 
|  | %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 | 
|  |  | 
|  | %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() | 
|  | %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0 | 
|  | %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1 | 
|  | %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2 | 
|  | %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3 | 
|  | %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4 | 
|  | %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5 | 
|  | %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6 | 
|  | %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7 | 
|  |  | 
|  | ; 0x40100 / 64 = 4100 (for wave64), i.e. 0x1004 in the FLATSCR checks | 
|  | %a = load volatile i32, ptr addrspace(5) %aptr | 
|  | call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a) | 
|  |  | 
|  | %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() | 
|  | %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0 | 
|  | %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1 | 
|  | %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2 | 
|  | %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3 | 
|  | %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4 | 
|  | %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5 | 
|  | %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6 | 
|  | %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7 | 
|  |  | 
|  | call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 | 
|  | ; Force %a to spill with no free SGPRs | 
|  | call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a) | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 { | 
|  | ; MUBUF-LABEL: test_sgpr_offset_function_scavenge_fail_kernel: | 
|  | ; MUBUF:       ; %bb.0: ; %entry | 
|  | ; MUBUF-NEXT:    s_add_u32 s0, s0, s17 | 
|  | ; MUBUF-NEXT:    s_addc_u32 s1, s1, 0 | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    s_mov_b32 s10, 0x40100 | 
|  | ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    s_endpgm | 
|  | ; | 
|  | ; FLATSCR-LABEL: test_sgpr_offset_function_scavenge_fail_kernel: | 
|  | ; FLATSCR:       ; %bb.0: ; %entry | 
|  | ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13 | 
|  | ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0 | 
|  | ; FLATSCR-NEXT:    s_mov_b32 s8, 0 | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    scratch_load_dword v0, off, s8 offset:8 glc | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    s_movk_i32 s8, 0x1004 | 
|  | ; FLATSCR-NEXT:    scratch_store_dword off, v0, s8 ; 4-byte Folded Spill | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    scratch_load_dword v0, off, s8 ; 4-byte Folded Reload | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    s_endpgm | 
|  | entry: | 
|  | ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not | 
|  | ; fit in the instruction, and has to live in the SGPR offset. | 
|  | %alloca = alloca i8, i32 4096, align 4, addrspace(5) | 
|  |  | 
|  | %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 | 
|  |  | 
|  | %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() | 
|  | %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0 | 
|  | %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1 | 
|  | %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2 | 
|  | %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3 | 
|  | %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4 | 
|  | %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5 | 
|  | %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6 | 
|  | %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7 | 
|  |  | 
|  | ; 0x40100 / 64 = 4100 (for wave64), i.e. 0x1004 in the FLATSCR checks | 
|  | %a = load volatile i32, ptr addrspace(5) %aptr | 
|  | call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a) | 
|  |  | 
|  | %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() | 
|  | %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0 | 
|  | %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1 | 
|  | %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2 | 
|  | %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3 | 
|  | %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4 | 
|  | %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5 | 
|  | %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6 | 
|  | %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7 | 
|  |  | 
|  | call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 | 
|  | ; Force %a to spill with no free SGPRs | 
|  | call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a) | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { | 
|  | ; MUBUF-LABEL: test_sgpr_offset_subregs_kernel: | 
|  | ; MUBUF:       ; %bb.0: ; %entry | 
|  | ; MUBUF-NEXT:    s_add_u32 s0, s0, s17 | 
|  | ; MUBUF-NEXT:    s_addc_u32 s1, s1, 0 | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:12 glc | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Spill | 
|  | ; MUBUF-NEXT:    s_nop 0 | 
|  | ; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 glc | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Reload | 
|  | ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ; v[0:1] | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    s_endpgm | 
|  | ; | 
|  | ; FLATSCR-LABEL: test_sgpr_offset_subregs_kernel: | 
|  | ; FLATSCR:       ; %bb.0: ; %entry | 
|  | ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13 | 
|  | ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0 | 
|  | ; FLATSCR-NEXT:    s_mov_b32 s0, 0 | 
|  | ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 offset:8 glc | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    s_movk_i32 s0, 0xff4 | 
|  | ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill | 
|  | ; FLATSCR-NEXT:    s_mov_b32 s0, 0 | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    s_movk_i32 s0, 0xff4 | 
|  | ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ; v[0:1] | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    s_endpgm | 
|  | entry: | 
|  | ; Occupy 4084 bytes of scratch, so that the spill of the last subreg of %a | 
|  | ; still fits below offset 4096 (4084 + 8 - 4 = 4088), and can be placed in | 
|  | ; the instruction offset field. | 
|  | %alloca = alloca i8, i32 4084, align 4, addrspace(5) | 
|  | %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1 | 
|  | %a = load volatile <2 x i32>, ptr addrspace(5) %aptr | 
|  |  | 
|  | ; Force %a to spill. | 
|  | call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () | 
|  |  | 
|  | ; Ensure the alloca sticks around. | 
|  | %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 | 
|  | %b = load volatile i32, ptr addrspace(5) %bptr | 
|  |  | 
|  | ; Ensure the spill is of the full super-reg. | 
|  | call void asm sideeffect "; $0", "r"(<2 x i32> %a) | 
|  |  | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define amdgpu_kernel void @test_inst_offset_subregs_kernel() { | 
|  | ; MUBUF-LABEL: test_inst_offset_subregs_kernel: | 
|  | ; MUBUF:       ; %bb.0: ; %entry | 
|  | ; MUBUF-NEXT:    s_add_u32 s0, s0, s17 | 
|  | ; MUBUF-NEXT:    s_addc_u32 s1, s1, 0 | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:12 glc | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16 glc | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    s_mov_b32 s4, 0x3ff00 | 
|  | ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill | 
|  | ; MUBUF-NEXT:    s_nop 0 | 
|  | ; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Spill | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload | 
|  | ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ; v[0:1] | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    s_endpgm | 
|  | ; | 
|  | ; FLATSCR-LABEL: test_inst_offset_subregs_kernel: | 
|  | ; FLATSCR:       ; %bb.0: ; %entry | 
|  | ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13 | 
|  | ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0 | 
|  | ; FLATSCR-NEXT:    s_mov_b32 s0, 0 | 
|  | ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 offset:12 glc | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    s_movk_i32 s0, 0xffc | 
|  | ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill | 
|  | ; FLATSCR-NEXT:    s_mov_b32 s0, 0 | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 offset:8 glc | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    s_movk_i32 s0, 0xffc | 
|  | ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ; v[0:1] | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    s_endpgm | 
|  | entry: | 
|  | ; Allocate 4088 bytes of scratch; the spill of %a starts at offset 4092, so its | 
|  | ; last subreg does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to | 
|  | ; live in the SGPR offset. | 
|  | %alloca = alloca i8, i32 4088, align 4, addrspace(5) | 
|  |  | 
|  | ; 0x3ff00 / 64 = 4092 (for wave64) | 
|  | %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1 | 
|  | %a = load volatile <2 x i32>, ptr addrspace(5) %aptr | 
|  |  | 
|  | ; Force %a to spill. | 
|  | call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () | 
|  |  | 
|  | ; Ensure the alloca sticks around. | 
|  | %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 | 
|  | %b = load volatile i32, ptr addrspace(5) %bptr | 
|  |  | 
|  | ; Ensure the spill is of the full super-reg. | 
|  | call void asm sideeffect "; $0", "r"(<2 x i32> %a) | 
|  |  | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define void @test_inst_offset_function() { | 
|  | ; MUBUF-LABEL: test_inst_offset_function: | 
|  | ; MUBUF:       ; %bb.0: ; %entry | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4088 ; 4-byte Folded Spill | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4088 ; 4-byte Folded Reload | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    s_setpc_b64 s[30:31] | 
|  | ; | 
|  | ; FLATSCR-LABEL: test_inst_offset_function: | 
|  | ; FLATSCR:       ; %bb.0: ; %entry | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
|  | ; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:4088 ; 4-byte Folded Spill | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:4088 ; 4-byte Folded Reload | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:4 | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    s_setpc_b64 s[30:31] | 
|  | entry: | 
|  | ; Occupy 4088 bytes of scratch, so the offset of the spill of %a (4088) | 
|  | ; just fits in the instruction offset field when the emergency stack | 
|  | ; slot is added. It's hard to hit the actual limit since we're also | 
|  | ; going to insert the emergency stack slot for large frames. | 
|  | %alloca = alloca i8, i32 4088, align 4, addrspace(5) | 
|  |  | 
|  | %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 | 
|  |  | 
|  |  | 
|  | %a = load volatile i32, ptr addrspace(5) %aptr | 
|  |  | 
|  | ; Force %a to spill by clobbering v0-v7. | 
|  | call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () | 
|  |  | 
|  | %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 | 
|  | store volatile i32 %a, ptr addrspace(5) %outptr | 
|  |  | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define void @test_sgpr_offset_function() { | 
|  | ; MUBUF-LABEL: test_sgpr_offset_function: | 
|  | ; MUBUF:       ; %bb.0: ; %entry | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 glc | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    s_add_i32 s4, s32, 0x40100 | 
|  | ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    s_add_i32 s4, s32, 0x40100 | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8 | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    s_setpc_b64 s[30:31] | 
|  | ; | 
|  | ; FLATSCR-LABEL: test_sgpr_offset_function: | 
|  | ; FLATSCR:       ; %bb.0: ; %entry | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
|  | ; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:8 glc | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    s_add_i32 s0, s32, 0x1004 | 
|  | ; FLATSCR-NEXT:    scratch_store_dword off, v0, s0 ; 4-byte Folded Spill | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    s_add_i32 s0, s32, 0x1004 | 
|  | ; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 ; 4-byte Folded Reload | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:8 | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    s_setpc_b64 s[30:31] | 
|  | entry: | 
|  | ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not | 
|  | ; fit in the instruction, and has to live in the SGPR offset. | 
|  | %alloca = alloca i8, i32 4096, align 4, addrspace(5) | 
|  |  | 
|  | %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 | 
|  | ; 0x40100 / 64 = 4100 (for wave64), i.e. 0x1004 in the FLATSCR checks | 
|  | %a = load volatile i32, ptr addrspace(5) %aptr | 
|  |  | 
|  | ; Force %a to spill | 
|  | call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () | 
|  |  | 
|  | %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 | 
|  | store volatile i32 %a, ptr addrspace(5) %outptr | 
|  |  | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define void @test_sgpr_offset_subregs_function() { | 
|  | ; MUBUF-LABEL: test_sgpr_offset_subregs_function: | 
|  | ; MUBUF:       ; %bb.0: ; %entry | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 glc | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:12 glc | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4084 ; 4-byte Folded Spill | 
|  | ; MUBUF-NEXT:    s_nop 0 | 
|  | ; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4088 ; 4-byte Folded Spill | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4084 ; 4-byte Folded Reload | 
|  | ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4088 ; 4-byte Folded Reload | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ; v[0:1] | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    s_setpc_b64 s[30:31] | 
|  | ; | 
|  | ; FLATSCR-LABEL: test_sgpr_offset_subregs_function: | 
|  | ; FLATSCR:       ; %bb.0: ; %entry | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
|  | ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:8 glc | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:4084 ; 8-byte Folded Spill | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:4084 ; 8-byte Folded Reload | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ; v[0:1] | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    s_setpc_b64 s[30:31] | 
|  | entry: | 
|  | ; We want to test the spill of the last subreg of %a is the highest | 
|  | ; valid value for the immediate offset. We enable the emergency | 
|  | ; stack slot for large frames, so it's hard to get the frame layout | 
|  | ; exactly as we want to test it. | 
|  | ; Occupy 4084 bytes of scratch, so that the spill of the last subreg of %a | 
|  | ; still fits below offset 4096 (4084 + 8 - 4 = 4088), and can be placed in | 
|  | ; the instruction offset field. | 
|  | %alloca = alloca i8, i32 4084, align 4, addrspace(5) | 
|  | %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1 | 
|  | %a = load volatile <2 x i32>, ptr addrspace(5) %aptr | 
|  |  | 
|  | ; Force %a to spill. | 
|  | call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () | 
|  |  | 
|  | ; Ensure the alloca sticks around. | 
|  | %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 | 
|  | %b = load volatile i32, ptr addrspace(5) %bptr | 
|  |  | 
|  | ; Ensure the spill is of the full super-reg. | 
|  | call void asm sideeffect "; $0", "r"(<2 x i32> %a) | 
|  |  | 
|  | ret void | 
|  | } | 
|  |  | 
|  | define void @test_inst_offset_subregs_function() { | 
|  | ; MUBUF-LABEL: test_inst_offset_subregs_function: | 
|  | ; MUBUF:       ; %bb.0: ; %entry | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 glc | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 glc | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    s_add_i32 s4, s32, 0x3ff00 | 
|  | ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill | 
|  | ; MUBUF-NEXT:    s_nop 0 | 
|  | ; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Spill | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 glc | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    s_add_i32 s4, s32, 0x3ff00 | 
|  | ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload | 
|  | ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload | 
|  | ; MUBUF-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; MUBUF-NEXT:    ;;#ASMSTART | 
|  | ; MUBUF-NEXT:    ; v[0:1] | 
|  | ; MUBUF-NEXT:    ;;#ASMEND | 
|  | ; MUBUF-NEXT:    s_setpc_b64 s[30:31] | 
|  | ; | 
|  | ; FLATSCR-LABEL: test_inst_offset_subregs_function: | 
|  | ; FLATSCR:       ; %bb.0: ; %entry | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
|  | ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:12 glc | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:4092 ; 8-byte Folded Spill | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:8 glc | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:4092 ; 8-byte Folded Reload | 
|  | ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; FLATSCR-NEXT:    ;;#ASMSTART | 
|  | ; FLATSCR-NEXT:    ; v[0:1] | 
|  | ; FLATSCR-NEXT:    ;;#ASMEND | 
|  | ; FLATSCR-NEXT:    s_setpc_b64 s[30:31] | 
|  | entry: | 
|  | ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a | 
|  | ; does not fit below offset 4096 (4088 + 4 + 8 - 4 = 4096), and has to live | 
|  | ; in the SGPR offset (MUBUF only; the FLATSCR checks use offset:4092). | 
|  | %alloca = alloca i8, i32 4088, align 4, addrspace(5) | 
|  |  | 
|  | ; 0x3ff00 / 64 = 4092 (for wave64) | 
|  | %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1 | 
|  | %a = load volatile <2 x i32>, ptr addrspace(5) %aptr | 
|  |  | 
|  | ; Force %a to spill. | 
|  | call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () | 
|  |  | 
|  | ; Ensure the alloca sticks around. | 
|  | %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 | 
|  | %b = load volatile i32, ptr addrspace(5) %bptr | 
|  |  | 
|  | ; Ensure the spill is of the full super-reg. | 
|  | call void asm sideeffect "; $0", "r"(<2 x i32> %a) | 
|  |  | 
|  | ret void | 
|  | } | 
|  |  | 
|  | attributes #0 = { nounwind } | 
|  | attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" } | 
|  | attributes #2 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } | 
|  | attributes #3 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" } |