AMDGPU: Rewrite SILowerI1Copies to always stay on SALU

Summary:
Instead of writing boolean values temporarily into 32-bit VGPRs
if they are involved in PHIs or are observed from outside a loop,
we use bitwise masking operations to combine lane masks in a way
that is consistent with wave control flow.
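
As a rough sketch (s_accum, s_cond and s_tmp are placeholder names
for SGPR pairs, not registers taken from the patch), merging a
condition bit into a lane mask under the current EXEC follows the
pattern that recurs throughout the updated tests below:

  s_andn2_b64  s_accum, s_accum, exec   ; drop stale bits of the active lanes
  s_and_b64    s_tmp, s_cond, exec      ; new value, limited to active lanes
  s_or_b64     s_accum, s_accum, s_tmp  ; merge into the running mask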

Move SIFixSGPRCopies to before this pass, since that pass
incorrectly attempts to move SGPR phis to VGPRs.

This should recover most of the code quality that was lost with
the bug fix in "AMDGPU: Remove PHI loop condition optimization".

There are still some relevant cases where code quality could be
improved, in particular:

- We often introduce redundant masks with EXEC. Ideally, we'd
  have a generic computeKnownBits-like analysis to determine
  whether masks are already masked by EXEC, so we can avoid this
  masking both here and when lowering uniform control flow (see
  the sketch after this list).

- The criterion we use to determine whether a def is observed
  from outside a loop is conservative: it doesn't check whether
  (loop) branch conditions are uniform.
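
A hedged illustration of the first point (loop_break.ll below
contains a real instance): assuming the VALU compare already zeroes
the result bits of inactive lanes, the subsequent re-masking is a
no-op:

  v_cmp_lt_i32_e32  vcc, -1, v0     ; result assumed already masked by EXEC
  s_and_b64         vcc, exec, vcc  ; redundant re-mask we'd like to avoid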

Change-Id: Ibabdb373a7510e426b90deef00f5e16c5d56e64b

Reviewers: arsenm, rampitec, tpr

Subscribers: kzhuravl, jvesely, wdng, mgorny, yaxunl, dstuttard, t-tye, eraman, llvm-commits

Differential Revision: https://reviews.llvm.org/D53496

llvm-svn: 345719
diff --git a/llvm/test/CodeGen/AMDGPU/add_i1.ll b/llvm/test/CodeGen/AMDGPU/add_i1.ll
index fb3b69c..c5f7e3a 100644
--- a/llvm/test/CodeGen/AMDGPU/add_i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_i1.ll
@@ -21,8 +21,8 @@
 }
 
 ; GCN-LABEL: {{^}}add_i1_cf:
-; GCN: v_cmp_ne_u32_e32 vcc, 0, {{v[0-9]+}}
-; GCN-NEXT: s_not_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
+; GCN: ; %endif
+; GCN: s_not_b64
 define amdgpu_kernel void @add_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
index 84a2d3d..ae78a1e 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
@@ -1,19 +1,25 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}i1_copy_from_loop:
 ;
-; Cannot use an SGPR mask to copy %cc out of the loop, since the mask would
-; only contain the lanes that were active during the last loop iteration.
-;
 ; SI: ; %for.body
-; SI:      v_cmp_gt_u32_e64 [[SREG:s\[[0-9]+:[0-9]+\]]], 4,
-; SI:      v_cndmask_b32_e64 [[VREG:v[0-9]+]], 0, -1, [[SREG]]
-; SI-NEXT: s_cbranch_vccnz [[ENDIF:BB[0-9_]+]]
-; SI:      [[ENDIF]]:
-; SI-NOT:  [[VREG]]
-; SI:      ; %for.end
-; SI:      v_cmp_ne_u32_e32 vcc, 0, [[VREG]]
+; SI:      v_cmp_gt_u32_e64  [[CC_SREG:s\[[0-9]+:[0-9]+\]]], 4,
+; SI-DAG:  s_andn2_b64       [[CC_ACCUM:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
+; SI-DAG:  s_and_b64         [[CC_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_SREG]], exec
+; SI:      s_or_b64          [[CC_ACCUM]], [[CC_ACCUM]], [[CC_MASK]]
+
+; SI: ; %Flow1
+; SI:      s_or_b64          [[CC_ACCUM]], [[CC_ACCUM]], exec
+
+; SI: ; %Flow
+; SI-DAG:  s_andn2_b64       [[LCSSA_ACCUM:s\[[0-9]+:[0-9]+\]]], [[LCSSA_ACCUM]], exec
+; SI-DAG:  s_and_b64         [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
+; SI:      s_or_b64          [[LCSSA_ACCUM]], [[LCSSA_ACCUM]], [[CC_MASK2]]
+
+; SI: ; %for.end
+; SI:      s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[LCSSA_ACCUM]]
+
 define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
 entry:
   br label %for.body
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll
new file mode 100644
index 0000000..0aacbbf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll
@@ -0,0 +1,38 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_dont_clobber_scc:
+
+; GCN: ; %entry
+; GCN:      s_cmp_eq_u32    s0, 0
+; GCN:      s_cbranch_scc1  [[PREEXIT:BB[0-9_]+]]
+
+; GCN: ; %blocka
+; GCN:      s_xor_b64       s[{{[0-9:]+}}], exec, -1
+; GCN:      s_cmp_eq_u32    s1, 0
+; GCN:      s_cbranch_scc1  [[EXIT:BB[0-9_]+]]
+
+; GCN: [[PREEXIT]]:
+; GCN: [[EXIT]]:
+
+define amdgpu_vs float @test_dont_clobber_scc(i32 inreg %uni, i32 inreg %uni2) #0 {
+entry:
+  %cc.uni = icmp eq i32 %uni, 0
+  br i1 %cc.uni, label %exit, label %blocka
+
+blocka:
+  call void asm sideeffect "; dummy a", ""()
+  %cc.uni2 = icmp eq i32 %uni2, 0
+  br i1 %cc.uni2, label %exit, label %blockb
+
+blockb:
+  call void asm sideeffect "; dummy b", ""()
+  br label %exit
+
+exit:
+  %cc.phi = phi i1 [ true, %entry ], [ false, %blocka ], [ false, %blockb ]
+  call void asm sideeffect "; dummy exit", ""()
+  %r = select i1 %cc.phi, float 1.0, float 2.0
+  ret float %r
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
index 63a9f1f..5b25271 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -2,12 +2,16 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}br_i1_phi:
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; SI: s_and_saveexec_b64
-; SI: v_mov_b32_e32 [[REG]], -1{{$}}
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[REG]]
-; SI: s_and_saveexec_b64
-; SI: s_endpgm
+
+; SI: ; %bb
+; SI:    s_mov_b64           [[TMP:s\[[0-9]+:[0-9]+\]]], 0
+
+; SI: ; %bb2
+; SI:    s_mov_b64           [[TMP]], exec
+
+; SI: ; %bb3
+; SI:    s_and_saveexec_b64  {{s\[[0-9]+:[0-9]+\]}}, [[TMP]]
+
 define amdgpu_kernel void @br_i1_phi(i32 %arg) {
 bb:
   %tidig = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
index a0563cd..9615efa 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
@@ -198,7 +198,8 @@
 }
 
 ; CHECK-LABEL: {{^}}i1_imm_input_phys_vgpr:
-; CHECK: v_mov_b32_e32 v0, -1{{$}}
+; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], -1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, [[MASK]]
 ; CHECK: ; use v0
 define amdgpu_kernel void @i1_imm_input_phys_vgpr() {
 entry:
@@ -212,10 +213,14 @@
 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, [[LOAD]]
 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
 ; CHECK: ; use v0
+; CHECK: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK: v_cndmask_b32_e64 [[STORE:v[0-9]+]], 0, 1, vcc
+; CHECK: {{buffer|flat}}_store_byte [[STORE]],
 define amdgpu_kernel void @i1_input_phys_vgpr() {
 entry:
   %val = load i1, i1 addrspace(1)* undef
-  call void asm sideeffect "; use $0 ", "{v0}"(i1 %val)
+  %cc = call i1 asm sideeffect "; use $1, def $0 ", "={v1}, {v0}"(i1 %val)
+  store i1 %cc, i1 addrspace(1)* undef
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index 34b842d..63c1556 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI %s
-; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 
 ; FIXME: Enable for VI.
 
@@ -144,20 +144,24 @@
 }
 
 ; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
-; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
-; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
 
-; SI: buffer_load_dword [[LOAD:v[0-9]+]]
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]
-; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: ; %entry
+; SI:     v_cmp_eq_u32_e64   [[CMP:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}}
+; SI:     s_mov_b64          vcc, 0
+; SI:     s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[CMP]]
 
+; SI: ; %bb
+; SI:     buffer_load_dword  [[LOAD:v[0-9]+]],
+; SI:     v_cmp_ne_u32_e32   vcc, 0, [[LOAD]]
+; SI:     s_and_b64          vcc, vcc, exec
 
-; SI: BB9_2:
-; SI: s_or_b64 exec, exec, [[SAVE]]
-; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: buffer_store_dword
-; SI: s_endpgm
+; SI: ; %exit
+; SI:     s_or_b64           exec, exec, [[SAVE]]
+; SI-NOT: vcc
+; SI:     v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI:     buffer_store_dword
+; SI:     s_endpgm
+
 define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll
index 5769501..f37b3a3 100644
--- a/llvm/test/CodeGen/AMDGPU/loop_break.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll
@@ -22,23 +22,28 @@
 ; OPT: bb9:
 ; OPT: call void @llvm.amdgcn.end.cf(i64
 
-; TODO: Can remove exec fixes in return block
 ; GCN-LABEL: {{^}}break_loop:
-; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; GCN:      s_mov_b64         [[OUTER_MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
 ; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1
-; GCN: v_cmp_lt_i32_e32 vcc, -1
-; GCN: s_and_b64 vcc, exec, vcc
-; GCN: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
+; GCN:      v_cmp_lt_i32_e32  vcc, -1
+; GCN:      s_and_b64         vcc, exec, vcc
+; GCN:      s_or_b64          [[INNER_MASK:s\[[0-9]+:[0-9]+\]]], [[INNER_MASK]], exec
+; GCN:      s_cbranch_vccnz   [[FLOW:BB[0-9]+_[0-9]+]]
 
-; GCN: ; %bb.2: ; %bb4
-; GCN: buffer_load_dword
-; GCN: v_cmp_ge_i32_e32 vcc,
+; GCN: ; %bb4
+; GCN:      buffer_load_dword
+; GCN:      v_cmp_ge_i32_e32  vcc,
+; GCN:      s_andn2_b64       [[INNER_MASK]], [[INNER_MASK]], exec
+; GCN:      s_and_b64         [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN:      s_or_b64          [[INNER_MASK]], [[INNER_MASK]], [[TMP0]]
 
-; GCN: [[FLOW]]:
-; GCN: s_or_b64 [[MASK]], vcc, [[MASK]]
-; GCN: s_andn2_b64 exec, exec, [[MASK]]
-; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]]
+; GCN: [[FLOW]]: ; %Flow
+; GCN:      s_and_b64         [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[INNER_MASK]]
+; GCN:      s_or_b64          [[TMP1]], [[TMP1]], [[OUTER_MASK]]
+; GCN:      s_mov_b64         [[OUTER_MASK]], [[TMP1]]
+; GCN:      s_andn2_b64       exec, exec, [[TMP1]]
+; GCN-NEXT: s_cbranch_execnz  [[LOOP_ENTRY]]
 
 ; GCN: ; %bb.4: ; %bb9
 ; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index fbdf983..679fd7c 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -59,31 +59,48 @@
 
 
 ; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
-; GCN: v_cmp_lt_i32_e32 vcc, 1
-; GCN: s_and_saveexec_b64
-; GCN: s_xor_b64
 
+; GCN:      s_mov_b64           [[EXIT1:s\[[0-9]+:[0-9]+\]]], 0
+; GCN:      v_cmp_lt_i32_e32    vcc, 1,
+; GCN:      s_mov_b64           [[EXIT0:s\[[0-9]+:[0-9]+\]]], 0
+; GCN:      s_and_saveexec_b64
+; GCN:      s_xor_b64
+
+; GCN: ; %LeafBlock1
+; GCN-NEXT: s_mov_b64           [[EXIT0]], exec
+; GCN-NEXT: v_cmp_ne_u32_e32    vcc, 2,
+; GCN-NEXT: s_and_b64           [[EXIT1]], vcc, exec
+
+; GCN: ; %Flow
+; GCN-NEXT: s_or_saveexec_b64
+; GCN-NEXT: s_xor_b64
 
 ; FIXME: Why is this compare essentially repeated?
-; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
-; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
+; GCN: ; %LeafBlock
+; GCN-DAG:  v_cmp_eq_u32_e32    vcc, 1,
+; GCN-DAG:  v_cmp_ne_u32_e64    [[TMP1:s\[[0-9]+:[0-9]+\]]], 1,
+; GCN-DAG:  s_andn2_b64         [[EXIT0]], [[EXIT0]], exec
+; GCN-DAG:  s_andn2_b64         [[EXIT1]], [[EXIT1]], exec
+; GCN-DAG:  s_and_b64           [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN-DAG:  s_and_b64           [[TMP1]], [[TMP1]], exec
+; GCN-DAG:  s_or_b64            [[EXIT0]], [[EXIT0]], [[TMP0]]
+; GCN-DAG:  s_or_b64            [[EXIT1]], [[EXIT1]], [[TMP1]]
 
 ; GCN: ; %Flow4
-; GCN-NEXT: s_or_b64 exec, exec
-; GCN: v_cmp_ne_u32_e32 vcc, 0
+; GCN-NEXT: s_or_b64            exec, exec,
+; GCN-NEXT: s_and_saveexec_b64  {{s\[[0-9]+:[0-9]+\]}}, [[EXIT1]]
+; GCN-NEXT: s_xor_b64
 
 ; GCN: ; %exit1
-; GCN: ds_write_b32
+; GCN:      ds_write_b32
+; GCN:      s_andn2_b64         [[EXIT0]], [[EXIT0]], exec
 
-; GCN: %Flow5
-; GCN-NEXT: s_or_b64 exec, exec
-; GCN: v_cmp_ne_u32_e32 vcc, 0
-; GCN-NEXT: s_and_saveexec_b64
+; GCN: ; %Flow5
+; GCN-NEXT: s_or_b64            exec, exec,
+; GCN-NEXT: s_and_saveexec_b64  {{s\[[0-9]+:[0-9]+\]}}, [[EXIT0]]
 
 ; GCN: ; %exit0
-; GCN: buffer_store_dword
+; GCN:      buffer_store_dword
 
 ; GCN: ; %UnifiedReturnBlock
 ; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
index c4e2f1e..4c1a769 100644
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -21,34 +21,46 @@
 
 ; GCN-LABEL: {{^}}multi_else_break:
 
+; GCN: ; %main_body
+; GCN:      s_mov_b64           [[LEFT_OUTER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+
 ; GCN: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP.outer{{$}}
+; GCN:      s_mov_b64           [[LEFT_INNER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
 ; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}}
-; GCN: s_and_saveexec_b64 [[SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN:      s_or_b64            [[BREAK_OUTER:s\[[0-9]+:[0-9]+\]]], [[BREAK_OUTER]], exec
+; GCN:      s_or_b64            [[BREAK_INNER:s\[[0-9]+:[0-9]+\]]], [[BREAK_INNER]], exec
+; GCN:      s_and_saveexec_b64  [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
 
-; GCN: BB{{[0-9]+}}_{{[0-9]+}}: ; %Flow{{$}}
-; GCN-NEXT: ; in Loop: Header=[[INNER_LOOP]] Depth=2
+; FIXME: duplicate comparison
+; GCN: ; %ENDIF
+; GCN-DAG:  v_cmp_eq_u32_e32    vcc,
+; GCN-DAG:  v_cmp_ne_u32_e64    [[TMP51NEG:s\[[0-9]+:[0-9]+\]]],
+; GCN-DAG:  s_andn2_b64         [[BREAK_OUTER]], [[BREAK_OUTER]], exec
+; GCN-DAG:  s_andn2_b64         [[BREAK_INNER]], [[BREAK_INNER]], exec
+; GCN-DAG:  s_and_b64           [[TMP_EQ:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN-DAG:  s_and_b64           [[TMP_NE:s\[[0-9]+:[0-9]+\]]], [[TMP51NEG]], exec
+; GCN-DAG:  s_or_b64            [[BREAK_OUTER]], [[BREAK_OUTER]], [[TMP_EQ]]
+; GCN-DAG:  s_or_b64            [[BREAK_INNER]], [[BREAK_INNER]], [[TMP_NE]]
 
-; Ensure extra or eliminated
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVE_BREAK]]
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-; GCN-NEXT: s_or_b64 [[OR_BREAK:s\[[0-9]+:[0-9]+\]]], vcc, s{{\[[0-9]+:[0-9]+\]}}
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-; GCN-NEXT: v_mov_b32_e32
-; GCN-NEXT: s_andn2_b64 exec, exec, [[OR_BREAK]]
-; GCN-NEXT: s_cbranch_execnz [[INNER_LOOP]]
+; GCN: ; %Flow
+; GCN:      s_or_b64            exec, exec, [[SAVE_EXEC]]
+; GCN:      s_and_b64           [[TMP0:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_INNER]]
+; GCN:      s_or_b64            [[TMP0]], [[TMP0]], [[LEFT_INNER]]
+; GCN:      s_mov_b64           [[LEFT_INNER]], [[TMP0]]
+; GCN:      s_andn2_b64         exec, exec, [[TMP0]]
+; GCN:      s_cbranch_execnz    [[INNER_LOOP]]
 
-; GCN: ; %bb.{{[0-9]+}}: ; %Flow2{{$}}
-; GCN-NEXT: ; in Loop: Header=[[OUTER_LOOP]] Depth=1
+; GCN: ; %Flow2
+; GCN:      s_or_b64            exec, exec, [[TMP0]]
+; GCN:      s_and_b64           [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_OUTER]]
+; GCN:      s_or_b64            [[TMP1]], [[TMP1]], [[LEFT_OUTER]]
+; GCN:      s_mov_b64           [[LEFT_OUTER]], [[TMP1]]
+; GCN:      s_andn2_b64         exec, exec, [[TMP1]]
+; GCN:      s_cbranch_execnz    [[OUTER_LOOP]]
 
-; Ensure copy is eliminated
-; GCN-NEXT: s_or_b64 exec, exec, [[OR_BREAK]]
-; GCN-NEXT: s_and_b64 [[MASKED2_SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], exec, vcc
-; GCN-NEXT: s_or_b64 [[OUTER_OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[MASKED2_SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
-; GCN-NEXT: s_mov_b64
-; GCN-NEXT: v_mov_b32_e32
-; GCN-NEXT: s_andn2_b64 exec, exec, [[OUTER_OR_BREAK]]
-; GCN-NEXT: s_cbranch_execnz [[OUTER_LOOP]]
+; GCN: ; %IF
+; GCN-NEXT: s_endpgm
 define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
 main_body:
   br label %LOOP.outer
@@ -78,12 +90,38 @@
 ; OPT: llvm.amdgcn.end.cf
 
 ; GCN-LABEL: {{^}}multi_if_break_loop:
-; GCN: s_mov_b64 [[BREAK_REG:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; GCN:      s_mov_b64          [[LEFT:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
 ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}}
+; GCN:      s_mov_b64          [[OLD_LEFT:s\[[0-9]+:[0-9]+\]]], [[LEFT]]
 
-; GCN: s_or_b64 [[BREAK_REG]], vcc, [[BREAK_REG]]
-; GCN: s_andn2_b64 exec, exec, [[BREAK_REG]]
+; GCN: ; %LeafBlock1
+; GCN:      s_mov_b64
+; GCN:      s_mov_b64          [[BREAK:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+
+; GCN: ; %case1
+; GCN:      buffer_load_dword  [[LOAD2:v[0-9]+]],
+; GCN:      v_cmp_ge_i32_e32   vcc, {{v[0-9]+}}, [[LOAD2]]
+; GCN:      s_orn2_b64         [[BREAK]], vcc, exec
+
+; GCN: ; %Flow3
+; GCN:      s_branch           [[FLOW:BB[0-9]+_[0-9]+]]
+
+; GCN:      s_mov_b64          [[BREAK]], -1{{$}}
+
+; GCN: [[FLOW]]: ; %Flow
+
+; GCN: ; %case0
+; GCN:      buffer_load_dword  [[LOAD1:v[0-9]+]],
+; GCN-DAG:  s_andn2_b64        [[BREAK]], [[BREAK]], exec
+; GCN-DAG:  v_cmp_ge_i32_e32   vcc, {{v[0-9]+}}, [[LOAD1]]
+; GCN-DAG:  s_and_b64          [[TMP:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN:      s_or_b64           [[BREAK]], [[BREAK]], [[TMP]]
+
+; GCN: ; %Flow4
+; GCN:      s_and_b64          [[BREAK]], exec, [[BREAK]]
+; GCN:      s_or_b64           [[LEFT]], [[BREAK]], [[OLD_LEFT]]
+; GCN:      s_andn2_b64        exec, exec, [[LEFT]]
 ; GCN-NEXT: s_cbranch_execnz
 
 define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/select-opt.ll b/llvm/test/CodeGen/AMDGPU/select-opt.ll
index 33028f1..f773357 100644
--- a/llvm/test/CodeGen/AMDGPU/select-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-opt.ll
@@ -137,7 +137,6 @@
 ; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0
 ; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}}
 ; GCN: v_cmp_eq_f32_e32 vcc, 0, v{{[0-9]+}}
-; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 
 define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index 04df33b..3db6fd2 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -100,22 +100,22 @@
   ret void
 }
 
-; FIXME: Should write to different SGPR pairs instead of copying to
-; VALU for i1 phi.
-
 ; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br:
-; SI: buffer_load_dword [[AVAL:v[0-9]+]]
-; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]]
-; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]
 
-; SI: BB{{[0-9]+}}_2:
-; SI: buffer_load_dword [[AVAL:v[0-9]+]]
-; SI: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
-; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]
+; SI: ; %else
+; SI:      buffer_load_dword  [[AVAL:v[0-9]+]]
+; SI:      v_cmp_gt_i32_e64   [[PHI:s\[[0-9]+:[0-9]+\]]], 0, [[AVAL]]
 
-; SI: v_cmp_ne_u32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]]
-; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]]
-; SI: buffer_store_dword [[RESULT]]
+; SI: ; %if
+; SI:      buffer_load_dword  [[AVAL:v[0-9]+]]
+; SI:      v_cmp_eq_u32_e32   [[CMP_ELSE:vcc]], 0, [[AVAL]]
+; SI-DAG:  s_andn2_b64        [[PHI]], [[PHI]], exec
+; SI-DAG:  s_and_b64          [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP_ELSE]], exec
+; SI:      s_or_b64           [[PHI]], [[PHI]], [[TMP]]
+
+; SI: ; %endif
+; SI:      v_cndmask_b32_e64  [[RESULT:v[0-9]+]], 0, -1, [[PHI]]
+; SI:      buffer_store_dword [[RESULT]],
 define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index 73e5659..6215a48 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -asm-verbose=0 -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -asm-verbose=0 -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}break_inserted_outside_of_loop:
 
@@ -27,18 +27,23 @@
 
 
 ; FUNC-LABEL: {{^}}phi_cond_outside_loop:
-; FIXME: This could be folded into the s_or_b64 instruction
-; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0
-; SI: [[LOOP_LABEL:[A-Z0-9]+]]
-; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 
-; SI_IF_BREAK instruction:
-; SI: s_or_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], vcc, [[ZERO]]
+; SI:     s_mov_b64         [[LEFT:s\[[0-9]+:[0-9]+\]]], 0
+; SI:     s_mov_b64         [[PHI:s\[[0-9]+:[0-9]+\]]], 0
 
-; SI_LOOP instruction:
-; SI: s_andn2_b64 exec, exec, [[BREAK]]
-; SI: s_cbranch_execnz [[LOOP_LABEL]]
-; SI: s_endpgm
+; SI: ; %else
+; SI:     v_cmp_eq_u32_e64  [[TMP:s\[[0-9]+:[0-9]+\]]],
+; SI:     s_and_b64         [[PHI]], [[TMP]], exec
+
+; SI: ; %endif
+
+; SI: [[LOOP_LABEL:BB[0-9]+_[0-9]+]]: ; %loop
+; SI:     s_mov_b64         [[TMP:s\[[0-9]+:[0-9]+\]]], [[LEFT]]
+; SI:     s_and_b64         [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[PHI]]
+; SI:     s_or_b64          [[LEFT]], [[TMP1]], [[TMP]]
+; SI:     s_andn2_b64       exec, exec, [[LEFT]]
+; SI:     s_cbranch_execnz  [[LOOP_LABEL]]
+; SI:     s_endpgm
 
 define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) {
 entry:
@@ -90,19 +95,21 @@
 ; This broke the old AMDIL cfg structurizer
 ; FUNC-LABEL: {{^}}loop_land_info_assert:
 ; SI:      v_cmp_lt_i32_e64 [[CMP4:s\[[0-9:]+\]]], s{{[0-9]+}}, 4{{$}}
-; SI:      s_and_b64 vcc, exec, [[CMP4]]
-; SI-NEXT: s_cbranch_vccnz [[BR1:BB[0-9_]+]]
-; SI-NEXT: s_branch [[BR2:BB[0-9_]+]]
-; SI-NEXT: BB{{[0-9_]+}}:
-; SI-NEXT: buffer_store_dword
+; SI:      s_and_b64        [[CMP4M:s\[[0-9]+:[0-9]+\]]], exec, [[CMP4]]
+; SI:      s_mov_b64        vcc, [[CMP4M]]
+; SI-NEXT: s_cbranch_vccnz  [[CONVEX_EXIT:BB[0-9_]+]]
+; SI-NEXT: s_branch         [[FOR_COND_PREHDR:BB[0-9_]+]]
+
+; SI: ; %if.else
+; SI:      buffer_store_dword
 
 ; SI:      [[INFLOOP:BB[0-9]+_[0-9]+]]:
 
-; SI:      [[BR1]]:
-; SI-NEXT: s_and_b64 vcc, exec,
-; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]]
+; SI:      [[CONVEX_EXIT]]:
+; SI:      s_mov_b64        vcc,
+; SI-NEXT: s_cbranch_vccnz  [[ENDPGM:BB[0-9]+_[0-9]+]]
 ; SI:      s_branch [[INFLOOP]]
-; SI-NEXT: [[BR2]]:
+; SI-NEXT: [[FOR_COND_PREHDR]]:
 ; SI:      s_cbranch_vccz [[ENDPGM]]
 
 ; SI:      [[ENDPGM]]:
diff --git a/llvm/test/CodeGen/AMDGPU/sub_i1.ll b/llvm/test/CodeGen/AMDGPU/sub_i1.ll
index 70562a5..6861d32 100644
--- a/llvm/test/CodeGen/AMDGPU/sub_i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub_i1.ll
@@ -21,8 +21,8 @@
 }
 
 ; GCN-LABEL: {{^}}sub_i1_cf:
-; GCN: v_cmp_ne_u32_e32 vcc, 0, {{v[0-9]+}}
-; GCN-NEXT: s_not_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
+; GCN: ; %endif
+; GCN: s_not_b64
 define amdgpu_kernel void @sub_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
index 3d980b7..ca85f0b 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
@@ -8,23 +8,22 @@
 
 
 ; waitcnt should be inserted after exec modification
-; SI: v_cmp_lt_i32_e32 vcc, 0,
-; SI: v_mov_b32_e32 {{v[0-9]+}}, 0
+; SI:      v_cmp_lt_i32_e32 vcc, 0,
+; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
+; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
 ; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
 ; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
 ; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
 
 ; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
-; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
-; SI: v_mov_b32_e32 v{{[0-9]}}, -1
-; SI: s_and_saveexec_b64
+; SI:      s_mov_b64 s[{{[0-9]:[0-9]}}], -1
+; SI:      s_and_saveexec_b64
 ; SI-NEXT: ; mask branch
 
 ; v_mov should be after exec modification
 ; SI: [[FLOW_BB]]:
 ; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]
-; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}}
 ; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
 ; SI-NEXT: ; mask branch
 ;
@@ -220,9 +219,10 @@
 ; SI: [[LABEL_FLOW]]:
 ; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
 ; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]]
-; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
+; SI-NEXT: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]],
+; SI-NEXT: s_or_b64 [[TMP2:s\[[0-9]+:[0-9]+\]]], [[TMP1]], [[COND_STATE]]
+; SI-NEXT: s_mov_b64 [[COND_STATE]], [[TMP2]]
+; SI-NEXT: s_andn2_b64 exec, exec, [[TMP2]]
 ; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]
 
 ; SI: [[LABEL_EXIT]]:
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll
index a941e5f..08267b7 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll
@@ -6,7 +6,7 @@
 ; GCN-LABEL: {{^}}testKernel
 ; GCN: BB0_1:
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cmp_eq_f32_e64
+; GCN-NEXT: v_cmp_eq_f32_e32
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT: v_cmp_eq_f32_e32
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)