blob: a5edc2ea19362802ff09a752fe8af348f8cdd5a5 [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-remove-redundant-endcf -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; Disabled endcf collapse at -O0.
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -O0 -amdgpu-remove-redundant-endcf -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN-O0 %s
; Note: Breaking large PHIs is disabled to prevent branches from being eliminated (in scc_liveness)
; Two nested ifs conditioned on the workitem id whose inner and outer regions
; both rejoin at %bb.outer.end. In the optimized run the inner if only narrows
; exec (s_and_b64 exec, exec, vcc) and a single s_or_b64 at %bb.outer.end
; restores the mask saved by the outer if. The -O0 run keeps a separate %Flow
; block that restores exec for the inner if before the outer restore.
define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: simple_nested_if:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB0_3
; GCN-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v2, v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_and_b64 exec, exec, vcc
; GCN-NEXT: s_cbranch_execz .LBB0_3
; GCN-NEXT: ; %bb.2: ; %bb.inner.then
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v1
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GCN-NEXT: s_mov_b32 s0, s2
; GCN-NEXT: s_mov_b32 s1, s2
; GCN-NEXT: v_mov_b32_e32 v2, 1
; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:4
; GCN-NEXT: .LBB0_3: ; %bb.outer.end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 3
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_write_b32 v1, v0
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: simple_nested_if:
; GCN-O0: ; %bb.0: ; %bb
; GCN-O0-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s10, -1
; GCN-O0-NEXT: s_mov_b32 s11, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s8, s8, s3
; GCN-O0-NEXT: s_addc_u32 s9, s9, 0
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 0
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v2, v0
; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s0, 1
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB0_4
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s4, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s5, v1, 1
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s0, 0
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v0
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, v0
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0
; GCN-O0-NEXT: v_mov_b32_e32 v2, 0
; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 4
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 5
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB0_3
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s2, 2
; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s2
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64
; GCN-O0-NEXT: .LBB0_3: ; %Flow
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 4
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 5
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: .LBB0_4: ; %bb.outer.end
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, 3
; GCN-O0-NEXT: v_mov_b32_e32 v0, 0
; GCN-O0-NEXT: s_mov_b32 m0, -1
; GCN-O0-NEXT: ds_write_b32 v0, v2
; GCN-O0-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = icmp ugt i32 %tmp, 1
br i1 %tmp1, label %bb.outer.then, label %bb.outer.end
bb.outer.then: ; preds = %bb
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
store i32 0, ptr addrspace(1) %tmp4, align 4
%tmp5 = icmp eq i32 %tmp, 2
; The inner if has no join block of its own: the skip edge goes straight to
; %bb.outer.end, the same block the outer if rejoins at.
br i1 %tmp5, label %bb.outer.end, label %bb.inner.then
bb.inner.then: ; preds = %bb.outer.then
%tmp7 = add i32 %tmp, 1
%tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp7
store i32 1, ptr addrspace(1) %tmp9, align 4
br label %bb.outer.end
bb.outer.end: ; preds = %bb.outer.then, %bb.inner.then, %bb
store i32 3, ptr addrspace(3) null
ret void
}
; Nested ifs where the inner if rejoins at %bb.inner.end, and that block stores
; to memory before branching on to %bb.outer.end. Both runs expect two separate
; exec restores: an s_or_b64 at %bb.inner.end for the inner if and another one
; for the outer if.
define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: uncollapsable_nested_if:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB1_4
; GCN-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v3
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
; GCN-NEXT: buffer_store_dword v4, v[3:4], s[0:3], 0 addr64
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT: s_cbranch_execz .LBB1_3
; GCN-NEXT: ; %bb.2: ; %bb.inner.then
; GCN-NEXT: s_mov_b32 s0, s2
; GCN-NEXT: s_mov_b32 s1, s2
; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:4
; GCN-NEXT: .LBB1_3: ; %bb.inner.end
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-NEXT: s_mov_b32 s0, s2
; GCN-NEXT: s_mov_b32 s1, s2
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 2
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
; GCN-NEXT: .LBB1_4: ; %Flow
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 3
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_write_b32 v1, v0
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: uncollapsable_nested_if:
; GCN-O0: ; %bb.0: ; %bb
; GCN-O0-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s10, -1
; GCN-O0-NEXT: s_mov_b32 s11, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s8, s8, s3
; GCN-O0-NEXT: s_addc_u32 s9, s9, 0
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 0
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v2, v0
; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s0, 1
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB1_3
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s4, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s5, v1, 1
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s0, 0
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v0
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, v0
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0
; GCN-O0-NEXT: v_mov_b32_e32 v2, 0
; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 4
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 5
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB1_4
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s2, 2
; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s2
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64
; GCN-O0-NEXT: s_branch .LBB1_4
; GCN-O0-NEXT: .LBB1_3: ; %Flow
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_branch .LBB1_5
; GCN-O0-NEXT: .LBB1_4: ; %bb.inner.end
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s2, v1, 4
; GCN-O0-NEXT: v_readlane_b32 s3, v1, 5
; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], v0
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64
; GCN-O0-NEXT: s_branch .LBB1_3
; GCN-O0-NEXT: .LBB1_5: ; %bb.outer.end
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, 3
; GCN-O0-NEXT: v_mov_b32_e32 v0, 0
; GCN-O0-NEXT: s_mov_b32 m0, -1
; GCN-O0-NEXT: ds_write_b32 v0, v2
; GCN-O0-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = icmp ugt i32 %tmp, 1
br i1 %tmp1, label %bb.outer.then, label %bb.outer.end
bb.outer.then: ; preds = %bb
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
store i32 0, ptr addrspace(1) %tmp4, align 4
%tmp5 = icmp eq i32 %tmp, 2
; Both edges of the inner if end up in %bb.inner.end, not in %bb.outer.end.
br i1 %tmp5, label %bb.inner.end, label %bb.inner.then
bb.inner.then: ; preds = %bb.outer.then
%tmp7 = add i32 %tmp, 1
%tmp8 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp7
store i32 1, ptr addrspace(1) %tmp8, align 4
br label %bb.inner.end
bb.inner.end: ; preds = %bb.inner.then, %bb.outer.then
; This store sits between the inner join point and the outer join point.
%tmp9 = add i32 %tmp, 2
%tmp10 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp9
store i32 2, ptr addrspace(1) %tmp10, align 4
br label %bb.outer.end
bb.outer.end: ; preds = %bb.inner.end, %bb
store i32 3, ptr addrspace(3) null
ret void
}
; An outer if whose body is an if/else; both arms branch directly to
; %bb.outer.end. In the optimized run the inner if/else uses
; s_and_saveexec/s_xor and s_andn2_saveexec, and the only s_or_b64 on exec is
; the outer restore at %bb.outer.end. The -O0 run additionally restores exec in
; a %Flow1 block before the outer restore.
define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: nested_if_if_else:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v2, v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GCN-NEXT: s_cbranch_execz .LBB2_5
; GCN-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-NEXT: v_mov_b32_e32 v4, s1
; GCN-NEXT: v_add_i32_e32 v3, vcc, s0, v1
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GCN-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GCN-NEXT: s_cbranch_execz .LBB2_3
; GCN-NEXT: ; %bb.2: ; %bb.else
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_mov_b32_e32 v0, 2
; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:8
; GCN-NEXT: ; implicit-def: $vgpr3_vgpr4
; GCN-NEXT: .LBB2_3: ; %Flow
; GCN-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GCN-NEXT: s_cbranch_execz .LBB2_5
; GCN-NEXT: ; %bb.4: ; %bb.then
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:4
; GCN-NEXT: .LBB2_5: ; %bb.outer.end
; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 3
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_write_b32 v2, v0
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: nested_if_if_else:
; GCN-O0: ; %bb.0: ; %bb
; GCN-O0-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s10, -1
; GCN-O0-NEXT: s_mov_b32 s11, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s8, s8, s3
; GCN-O0-NEXT: s_addc_u32 s9, s9, 0
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[0:1]
; GCN-O0-NEXT: v_writelane_b32 v1, s2, 0
; GCN-O0-NEXT: v_writelane_b32 v1, s3, 1
; GCN-O0-NEXT: v_mov_b32_e32 v2, v0
; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: v_mov_b32_e32 v4, 0
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, v0
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s4, 2
; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s4
; GCN-O0-NEXT: v_mov_b32_e32 v2, 0
; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64
; GCN-O0-NEXT: s_mov_b32 s0, 1
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB2_6
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[2:3], exec
; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3]
; GCN-O0-NEXT: v_writelane_b32 v1, s2, 4
; GCN-O0-NEXT: v_writelane_b32 v1, s3, 5
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB2_2
; GCN-O0-NEXT: s_branch .LBB2_4
; GCN-O0-NEXT: .LBB2_2: ; %Flow
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 4
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 5
; GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1]
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 6
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 7
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB2_5
; GCN-O0-NEXT: ; %bb.3: ; %bb.then
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s2, 2
; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s2
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64
; GCN-O0-NEXT: s_branch .LBB2_5
; GCN-O0-NEXT: .LBB2_4: ; %bb.else
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], v0
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64
; GCN-O0-NEXT: s_branch .LBB2_2
; GCN-O0-NEXT: .LBB2_5: ; %Flow1
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 6
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 7
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: .LBB2_6: ; %bb.outer.end
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, 3
; GCN-O0-NEXT: v_mov_b32_e32 v0, 0
; GCN-O0-NEXT: s_mov_b32 m0, -1
; GCN-O0-NEXT: ds_write_b32 v0, v2
; GCN-O0-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
store i32 0, ptr addrspace(1) %tmp1, align 4
%tmp2 = icmp ugt i32 %tmp, 1
br i1 %tmp2, label %bb.outer.then, label %bb.outer.end
bb.outer.then: ; preds = %bb
%tmp5 = icmp eq i32 %tmp, 2
; Inner if/else: each arm does one store and then jumps to %bb.outer.end, so
; there is no separate inner join block in the IR.
br i1 %tmp5, label %bb.then, label %bb.else
bb.then: ; preds = %bb.outer.then
%tmp3 = add i32 %tmp, 1
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp3
store i32 1, ptr addrspace(1) %tmp4, align 4
br label %bb.outer.end
bb.else: ; preds = %bb.outer.then
%tmp7 = add i32 %tmp, 2
%tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp7
store i32 2, ptr addrspace(1) %tmp9, align 4
br label %bb.outer.end
bb.outer.end: ; preds = %bb, %bb.then, %bb.else
store i32 3, ptr addrspace(3) null
ret void
}
; An outer if/else where each arm stores once and then contains its own inner
; if; all four paths rejoin at %bb.outer.end. The expected output in both runs
; has an s_or_b64 exec restore for each inner if (in %Flow and %Flow1) plus one
; at %bb.outer.end for the outer if/else.
define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: nested_if_else_if:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v3
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 2, v0
; GCN-NEXT: buffer_store_dword v4, v[3:4], s[0:3], 0 addr64
; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GCN-NEXT: s_xor_b64 s[4:5], exec, s[0:1]
; GCN-NEXT: s_cbranch_execz .LBB3_4
; GCN-NEXT: ; %bb.1: ; %bb.outer.else
; GCN-NEXT: s_mov_b32 s0, s2
; GCN-NEXT: s_mov_b32 s1, s2
; GCN-NEXT: v_mov_b32_e32 v3, 3
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:12
; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GCN-NEXT: s_cbranch_execz .LBB3_3
; GCN-NEXT: ; %bb.2: ; %bb.inner.then2
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s8, s10
; GCN-NEXT: s_mov_b32 s9, s10
; GCN-NEXT: v_mov_b32_e32 v0, 4
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[8:11], 0 addr64 offset:16
; GCN-NEXT: .LBB3_3: ; %Flow
; GCN-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2
; GCN-NEXT: ; implicit-def: $vgpr0
; GCN-NEXT: .LBB3_4: ; %Flow2
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB3_8
; GCN-NEXT: ; %bb.5: ; %bb.outer.then
; GCN-NEXT: s_mov_b32 s0, s2
; GCN-NEXT: s_mov_b32 s1, s2
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v3, 1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
; GCN-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:4
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT: s_cbranch_execz .LBB3_7
; GCN-NEXT: ; %bb.6: ; %bb.inner.then
; GCN-NEXT: v_mov_b32_e32 v0, 2
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
; GCN-NEXT: .LBB3_7: ; %Flow1
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-NEXT: .LBB3_8: ; %bb.outer.end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 3
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_write_b32 v1, v0
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: nested_if_else_if:
; GCN-O0: ; %bb.0: ; %bb
; GCN-O0-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s10, -1
; GCN-O0-NEXT: s_mov_b32 s11, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s8, s8, s3
; GCN-O0-NEXT: s_addc_u32 s9, s9, 0
; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-O0-NEXT: v_mov_b32_e32 v2, v0
; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s0, 0
; GCN-O0-NEXT: ; implicit-def: $sgpr0
; GCN-O0-NEXT: v_mov_b32_e32 v4, 0
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, v0
; GCN-O0-NEXT: v_mov_b32_e32 v3, v4
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: s_mov_b32 s1, s0
; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s1
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s2, s4
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: v_mov_b32_e32 v6, v4
; GCN-O0-NEXT: v_add_i32_e64 v5, s[2:3], s2, v2
; GCN-O0-NEXT: v_mov_b32_e32 v2, s1
; GCN-O0-NEXT: v_addc_u32_e64 v2, s[2:3], v2, v6, s[2:3]
; GCN-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v6, v2
; GCN-O0-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s1, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-O0-NEXT: v_mov_b32_e32 v2, 0
; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64
; GCN-O0-NEXT: v_cmp_lt_u32_e64 s[0:1], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[2:3], exec
; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3]
; GCN-O0-NEXT: v_writelane_b32 v1, s2, 0
; GCN-O0-NEXT: v_writelane_b32 v1, s3, 1
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_1
; GCN-O0-NEXT: s_branch .LBB3_4
; GCN-O0-NEXT: .LBB3_1: ; %Flow2
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
; GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1]
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_8
; GCN-O0-NEXT: ; %bb.2: ; %bb.outer.then
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_mov_b32 s4, s2
; GCN-O0-NEXT: s_mov_b32 s5, s0
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: v_mov_b32_e32 v2, 1
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 offset:4
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 4
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 5
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_7
; GCN-O0-NEXT: ; %bb.3: ; %bb.inner.then
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_mov_b32 s4, s2
; GCN-O0-NEXT: s_mov_b32 s5, s0
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 offset:8
; GCN-O0-NEXT: s_branch .LBB3_7
; GCN-O0-NEXT: .LBB3_4: ; %bb.outer.else
; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s1, 0xf000
; GCN-O0-NEXT: s_mov_b32 s0, 0
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s4, s0
; GCN-O0-NEXT: s_mov_b32 s5, s0
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, 3
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 offset:12
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 6
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 7
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_6
; GCN-O0-NEXT: ; %bb.5: ; %bb.inner.then2
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_mov_b32 s4, s2
; GCN-O0-NEXT: s_mov_b32 s5, s0
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, 4
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 offset:16
; GCN-O0-NEXT: .LBB3_6: ; %Flow
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 6
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 7
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_branch .LBB3_1
; GCN-O0-NEXT: .LBB3_7: ; %Flow1
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 4
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 5
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: .LBB3_8: ; %bb.outer.end
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, 3
; GCN-O0-NEXT: v_mov_b32_e32 v0, 0
; GCN-O0-NEXT: s_mov_b32 m0, -1
; GCN-O0-NEXT: ds_write_b32 v0, v2
; GCN-O0-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
store i32 0, ptr addrspace(1) %tmp1, align 4
%cc1 = icmp ugt i32 %tmp, 1
br i1 %cc1, label %bb.outer.then, label %bb.outer.else
bb.outer.then: ; preds = %bb
%tmp2 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 1
store i32 1, ptr addrspace(1) %tmp2, align 4
%cc2 = icmp eq i32 %tmp, 2
br i1 %cc2, label %bb.inner.then, label %bb.outer.end
bb.inner.then: ; preds = %bb.outer.then
%tmp3 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 2
store i32 2, ptr addrspace(1) %tmp3, align 4
br label %bb.outer.end
bb.outer.else: ; preds = %bb
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 3
store i32 3, ptr addrspace(1) %tmp4, align 4
%cc3 = icmp eq i32 %tmp, 0 ; avoid being optimized away through the domination
br i1 %cc3, label %bb.inner.then2, label %bb.outer.end
bb.inner.then2: ; preds = %bb.outer.else
%tmp5 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 4
store i32 4, ptr addrspace(1) %tmp5, align 4
br label %bb.outer.end
bb.outer.end: ; preds = %bb.outer.then, %bb.inner.then, %bb.outer.else, %bb.inner.then2
store i32 3, ptr addrspace(3) null
ret void
}
; Single divergent if (workitem id > 1) whose join block %bb.end executes
; llvm.amdgcn.s.barrier immediately before returning. In both the optimized
; and the -O0 expected output, exec is still restored with s_or_b64 at the
; start of %bb.end, ahead of the s_barrier.
; NOTE(review): judging by the function name, the point is that the exec
; restore must be kept here even though s_endpgm follows shortly after,
; because of the barrier in between - confirm against the pass being tested.
define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: s_endpgm_unsafe_barrier:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0
; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GCN-NEXT: s_cbranch_execz .LBB4_2
; GCN-NEXT: ; %bb.1: ; %bb.then
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v1, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: .LBB4_2: ; %bb.end
; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_barrier
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: s_endpgm_unsafe_barrier:
; GCN-O0: ; %bb.0: ; %bb
; GCN-O0-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s10, -1
; GCN-O0-NEXT: s_mov_b32 s11, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s8, s8, s3
; GCN-O0-NEXT: s_addc_u32 s9, s9, 0
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 0
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 1
; GCN-O0-NEXT: v_mov_b32_e32 v2, v0
; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s0, 1
; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2
; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB4_2
; GCN-O0-NEXT: ; %bb.1: ; %bb.then
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GCN-O0-NEXT: s_mov_b32 s5, s2
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_ashrrev_i32_e64 v0, 31, v2
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v0
; GCN-O0-NEXT: s_mov_b32 s4, 2
; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, 0
; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64
; GCN-O0-NEXT: .LBB4_2: ; %bb.end
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-O0-NEXT: s_barrier
; GCN-O0-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
; Divergent condition: only lanes whose workitem id is above 1 enter %bb.then.
%tmp1 = icmp ugt i32 %tmp, 1
br i1 %tmp1, label %bb.then, label %bb.end
bb.then: ; preds = %bb
; Store 0 to the element of %arg indexed by the workitem id.
%tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
store i32 0, ptr addrspace(1) %tmp4, align 4
br label %bb.end
bb.end: ; preds = %bb.then, %bb
; Join point: the barrier is the only instruction before the return.
call void @llvm.amdgcn.s.barrier()
ret void
}
; Non-kernel function (the expected code returns with s_setpc_b64 s[30:31])
; built around a loop that contains nested conditionals:
;   %bb1  loops on itself until %arg < 519, then enters %bb2;
;   %bb2  goes to %bb4 when %arg == 0, otherwise straight to %bb10;
;   %bb4  loads a float and goes to %bb8 when it is < 0.0, otherwise to %Flow;
;   %bb10 leaves the loop to %bb12 when %arg == 0, otherwise goes back to
;         %bb1 through %Flow1.
; The <4 x float> phis in %Flow and %bb10 are the reason the RUN lines pass
; -amdgpu-codegenprepare-break-large-phis=0 (see the note at the top of the
; file, which names this function).
; NOTE(review): the name suggests this is a regression test for SCC liveness
; around the exec-mask manipulation; nothing in the IR states that - confirm.
; NOTE(review): attribute group #0 includes readnone, yet the body performs a
; load and a volatile store - confirm this is intentional.
define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-LABEL: scc_liveness:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_movk_i32 s4, 0x207
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GCN-NEXT: s_mov_b32 s8, 0
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GCN-NEXT: s_mov_b64 s[12:13], 0
; GCN-NEXT: s_mov_b64 s[6:7], 0
; GCN-NEXT: s_branch .LBB5_3
; GCN-NEXT: .LBB5_1: ; %Flow
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[10:11]
; GCN-NEXT: .LBB5_2: ; %bb10
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[14:15]
; GCN-NEXT: s_and_b64 s[6:7], exec, s[4:5]
; GCN-NEXT: s_or_b64 s[12:13], s[6:7], s[12:13]
; GCN-NEXT: s_mov_b64 s[6:7], 0
; GCN-NEXT: s_andn2_b64 exec, exec, s[12:13]
; GCN-NEXT: s_cbranch_execz .LBB5_7
; GCN-NEXT: .LBB5_3: ; %bb1
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_and_b64 s[10:11], exec, vcc
; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GCN-NEXT: s_cbranch_execnz .LBB5_3
; GCN-NEXT: ; %bb.4: ; %bb2
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-NEXT: s_mov_b32 s9, s8
; GCN-NEXT: s_mov_b32 s10, s8
; GCN-NEXT: s_mov_b32 s11, s8
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: s_and_saveexec_b64 s[14:15], s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB5_2
; GCN-NEXT: ; %bb.5: ; %bb4
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_gt_f32_e64 s[6:7], 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[6:7]
; GCN-NEXT: s_cbranch_execz .LBB5_1
; GCN-NEXT: ; %bb.6: ; %bb8
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT: s_mov_b32 s9, s8
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: s_branch .LBB5_1
; GCN-NEXT: .LBB5_7: ; %bb12
; GCN-NEXT: s_or_b64 exec, exec, s[12:13]
; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: scc_liveness:
; GCN-O0: ; %bb.0: ; %bb
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[4:5], 0
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT: s_waitcnt expcnt(1)
; GCN-O0-NEXT: v_writelane_b32 v1, s6, 0
; GCN-O0-NEXT: v_writelane_b32 v1, s7, 1
; GCN-O0-NEXT: v_writelane_b32 v1, s4, 2
; GCN-O0-NEXT: v_writelane_b32 v1, s5, 3
; GCN-O0-NEXT: .LBB5_1: ; %bb1
; GCN-O0-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s8, v1, 2
; GCN-O0-NEXT: v_readlane_b32 s9, v1, 3
; GCN-O0-NEXT: v_readlane_b32 s6, v1, 0
; GCN-O0-NEXT: v_readlane_b32 s7, v1, 1
; GCN-O0-NEXT: v_writelane_b32 v1, s6, 4
; GCN-O0-NEXT: v_writelane_b32 v1, s7, 5
; GCN-O0-NEXT: s_mov_b32 s4, 0x207
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[4:5], v0, s4
; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
; GCN-O0-NEXT: v_writelane_b32 v1, s4, 6
; GCN-O0-NEXT: v_writelane_b32 v1, s5, 7
; GCN-O0-NEXT: v_writelane_b32 v1, s6, 0
; GCN-O0-NEXT: v_writelane_b32 v1, s7, 1
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT: v_writelane_b32 v1, s6, 2
; GCN-O0-NEXT: v_writelane_b32 v1, s7, 3
; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1
; GCN-O0-NEXT: ; %bb.2: ; %bb2
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s4, v1, 6
; GCN-O0-NEXT: v_readlane_b32 s5, v1, 7
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_mov_b32 s6, 0
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s6
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
; GCN-O0-NEXT: v_writelane_b32 v1, s4, 8
; GCN-O0-NEXT: v_writelane_b32 v1, s5, 9
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: s_mov_b32 s8, s4
; GCN-O0-NEXT: s_mov_b32 s9, s4
; GCN-O0-NEXT: s_mov_b32 s10, s4
; GCN-O0-NEXT: s_mov_b32 s11, s4
; GCN-O0-NEXT: v_mov_b32_e32 v2, s8
; GCN-O0-NEXT: v_mov_b32_e32 v3, s9
; GCN-O0-NEXT: v_mov_b32_e32 v4, s10
; GCN-O0-NEXT: v_mov_b32_e32 v5, s11
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s4, 10
; GCN-O0-NEXT: v_writelane_b32 v1, s5, 11
; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execz .LBB5_5
; GCN-O0-NEXT: ; %bb.3: ; %bb4
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
; GCN-O0-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_cmp_lt_f32_e64 s[6:7], v0, s4
; GCN-O0-NEXT: s_mov_b32 s8, s4
; GCN-O0-NEXT: s_mov_b32 s9, s4
; GCN-O0-NEXT: s_mov_b32 s10, s4
; GCN-O0-NEXT: s_mov_b32 s11, s4
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, s8
; GCN-O0-NEXT: v_mov_b32_e32 v3, s9
; GCN-O0-NEXT: v_mov_b32_e32 v4, s10
; GCN-O0-NEXT: v_mov_b32_e32 v5, s11
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s4, 12
; GCN-O0-NEXT: v_writelane_b32 v1, s5, 13
; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execz .LBB5_6
; GCN-O0-NEXT: ; %bb.4: ; %bb8
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: s_mov_b32 s10, 0
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: ; implicit-def: $sgpr5
; GCN-O0-NEXT: ; implicit-def: $sgpr9
; GCN-O0-NEXT: ; implicit-def: $sgpr5
; GCN-O0-NEXT: ; implicit-def: $sgpr8
; GCN-O0-NEXT: ; implicit-def: $sgpr5
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b32 s5, s10
; GCN-O0-NEXT: s_mov_b32 s6, s9
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, s4
; GCN-O0-NEXT: v_mov_b32_e32 v3, s5
; GCN-O0-NEXT: v_mov_b32_e32 v4, s6
; GCN-O0-NEXT: v_mov_b32_e32 v5, s7
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_branch .LBB5_6
; GCN-O0-NEXT: .LBB5_5: ; %Flow2
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s4, v1, 10
; GCN-O0-NEXT: v_readlane_b32 s5, v1, 11
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_branch .LBB5_7
; GCN-O0-NEXT: .LBB5_6: ; %Flow
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s4, v1, 12
; GCN-O0-NEXT: v_readlane_b32 s5, v1, 13
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_branch .LBB5_5
; GCN-O0-NEXT: .LBB5_7: ; %bb10
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: v_readlane_b32 s6, v1, 8
; GCN-O0-NEXT: v_readlane_b32 s7, v1, 9
; GCN-O0-NEXT: s_mov_b64 s[4:5], -1
; GCN-O0-NEXT: v_writelane_b32 v1, s4, 14
; GCN-O0-NEXT: v_writelane_b32 v1, s5, 15
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
; GCN-O0-NEXT: v_writelane_b32 v1, s4, 16
; GCN-O0-NEXT: v_writelane_b32 v1, s5, 17
; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execz .LBB5_9
; GCN-O0-NEXT: ; %bb.8: ; %Flow1
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: s_mov_b64 s[4:5], 0
; GCN-O0-NEXT: s_xor_b64 s[4:5], exec, -1
; GCN-O0-NEXT: v_writelane_b32 v1, s4, 14
; GCN-O0-NEXT: v_writelane_b32 v1, s5, 15
; GCN-O0-NEXT: .LBB5_9: ; %Flow3
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_readlane_b32 s8, v1, 16
; GCN-O0-NEXT: v_readlane_b32 s9, v1, 17
; GCN-O0-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-O0-NEXT: v_readlane_b32 s6, v1, 4
; GCN-O0-NEXT: v_readlane_b32 s7, v1, 5
; GCN-O0-NEXT: v_readlane_b32 s4, v1, 14
; GCN-O0-NEXT: v_readlane_b32 s5, v1, 15
; GCN-O0-NEXT: s_and_b64 s[4:5], exec, s[4:5]
; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0
; GCN-O0-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-O0-NEXT: v_writelane_b32 v1, s8, 0
; GCN-O0-NEXT: v_writelane_b32 v1, s9, 1
; GCN-O0-NEXT: v_writelane_b32 v1, s6, 2
; GCN-O0-NEXT: v_writelane_b32 v1, s7, 3
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT: v_writelane_b32 v1, s6, 18
; GCN-O0-NEXT: v_writelane_b32 v1, s7, 19
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1
; GCN-O0-NEXT: ; %bb.10: ; %bb12
; GCN-O0-NEXT: v_readlane_b32 s4, v1, 18
; GCN-O0-NEXT: v_readlane_b32 s5, v1, 19
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: ; %bb.11: ; %bb12
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v0, v5
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: v_mov_b32_e32 v6, s4
; GCN-O0-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v0, v4
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: v_mov_b32_e32 v6, s4
; GCN-O0-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v0, v3
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: v_mov_b32_e32 v6, s4
; GCN-O0-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v0, v2
; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: v_mov_b32_e32 v2, s4
; GCN-O0-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
bb:
br label %bb1
bb1: ; preds = %Flow1, %bb1, %bb
; Loop header with a self edge: stay in %bb1 while %arg >= 519.
%tmp = icmp slt i32 %arg, 519
br i1 %tmp, label %bb2, label %bb1
bb2: ; preds = %bb1
; %tmp3 is reused below in %bb10 as the loop-exit condition.
%tmp3 = icmp eq i32 %arg, 0
br i1 %tmp3, label %bb4, label %bb10
bb4: ; preds = %bb2
; The address is undef; only the control flow driven by the loaded value matters.
%tmp6 = load float, ptr addrspace(5) undef
%tmp7 = fcmp olt float %tmp6, 0.0
br i1 %tmp7, label %bb8, label %Flow
bb8: ; preds = %bb4
; Only element 1 is defined (0.0); the other three elements stay undef.
%tmp9 = insertelement <4 x float> undef, float 0.0, i32 1
br label %Flow
Flow: ; preds = %bb8, %bb4
; Inner join: merges the partially-undef vector with the all-zero vector.
%tmp8 = phi <4 x float> [ %tmp9, %bb8 ], [ zeroinitializer, %bb4 ]
br label %bb10
bb10: ; preds = %Flow, %bb2
; Outer join: all-zero when %bb4 was skipped, otherwise the value from %Flow.
%tmp11 = phi <4 x float> [ zeroinitializer, %bb2 ], [ %tmp8, %Flow ]
br i1 %tmp3, label %bb12, label %Flow1
Flow1: ; preds = %bb10
; Back edge to the loop header.
br label %bb1
bb12: ; preds = %bb10
; Loop exit: the volatile store keeps %tmp11 (and so both phis) live.
store volatile <4 x float> %tmp11, ptr addrspace(5) undef, align 16
ret void
}
; Intrinsics called by the test functions: the workitem id supplies the
; divergent branch conditions, and the barrier is called from
; @s_endpgm_unsafe_barrier.
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare void @llvm.amdgcn.s.barrier() #1
; Attribute groups: #0 is attached to the workitem-id intrinsic and to
; @scc_liveness, #1 to the barrier intrinsic.
; NOTE(review): #2 has no user in the visible tail of the file - confirm it is
; still referenced by an earlier function.
attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind convergent }
attributes #2 = { nounwind }