[StructurizeCFG] Clean up some boolean not instructions
In some cases StructurizeCFG inserts i1 xor instructions to invert
predicates. Add a quick cleanup loop that runs afterwards and removes
these xors when we can invert an existing compare instruction instead.
(StructurizeCFG is generally run late in the pipeline, so instcombine
does not clean them up for us.)
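
For example (taken from the updated checks in loop_break.ll), a pattern
like

  %cmp1 = icmp slt i32 %my.tmp, %load
  %0 = xor i1 %cmp1, true

is now cleaned up by inverting the predicate of the compare, leaving
just

  %cmp1 = icmp sge i32 %my.tmp, %load

with uses of the xor rewritten to use the inverted compare directly.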
Differential Revision: https://reviews.llvm.org/D118623
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index 3e8d1ec..746f259 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -139,8 +139,10 @@
; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_cmp_lg_u32 s4, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB4_4
-; CHECK-NEXT: ; %bb.1: ; %bb2
+; CHECK-NEXT: s_cbranch_scc0 .LBB4_2
+; CHECK-NEXT: .LBB4_1: ; %bb12
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+; CHECK-NEXT: .LBB4_2: ; %bb2
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, const.ptr@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, const.ptr@gotpcrel32@hi+12
@@ -153,15 +155,13 @@
; CHECK-NEXT: s_mov_b32 s4, -1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, 1.0, v0
-; CHECK-NEXT: s_cbranch_vccnz .LBB4_3
-; CHECK-NEXT: ; %bb.2: ; %bb7
+; CHECK-NEXT: s_cbranch_vccnz .LBB4_4
+; CHECK-NEXT: ; %bb.3: ; %bb7
; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB4_3: ; %bb8
+; CHECK-NEXT: .LBB4_4: ; %bb8
; CHECK-NEXT: s_cmp_lg_u32 s4, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB4_5
-; CHECK-NEXT: .LBB4_4: ; %bb12
-; CHECK-NEXT: s_setpc_b64 s[30:31]
-; CHECK-NEXT: .LBB4_5: ; %bb11
+; CHECK-NEXT: s_cbranch_scc1 .LBB4_1
+; CHECK-NEXT: ; %bb.5: ; %bb11
; CHECK-NEXT: v_mov_b32_e32 v0, 4.0
; CHECK-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 73416db..0b1105f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -838,7 +838,7 @@
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
; CGP-NEXT: ; implicit-def: $vgpr4
; CGP-NEXT: ; implicit-def: $vgpr10
-; CGP-NEXT: .LBB2_2: ; %Flow2
+; CGP-NEXT: .LBB2_2: ; %Flow1
; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[6:7]
; CGP-NEXT: s_cbranch_execz .LBB2_4
@@ -3118,7 +3118,7 @@
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3
; CGP-NEXT: ; implicit-def: $vgpr8
-; CGP-NEXT: .LBB8_2: ; %Flow2
+; CGP-NEXT: .LBB8_2: ; %Flow1
; CGP-NEXT: s_or_saveexec_b64 s[8:9], s[8:9]
; CGP-NEXT: v_lshl_b64 v[9:10], s[6:7], v6
; CGP-NEXT: s_xor_b64 exec, exec, s[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 5e60c7c..1e95103 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -824,7 +824,7 @@
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v2, v4, vcc
; CGP-NEXT: ; implicit-def: $vgpr4
; CGP-NEXT: ; implicit-def: $vgpr10
-; CGP-NEXT: .LBB2_2: ; %Flow2
+; CGP-NEXT: .LBB2_2: ; %Flow1
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]
; CGP-NEXT: s_cbranch_execz .LBB2_4
@@ -3072,7 +3072,7 @@
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v2, v4, vcc
; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3
; CGP-NEXT: ; implicit-def: $vgpr8
-; CGP-NEXT: .LBB8_2: ; %Flow2
+; CGP-NEXT: .LBB8_2: ; %Flow1
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[8:9]
; CGP-NEXT: v_lshl_b64 v[9:10], s[6:7], v6
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index bf3c080..2ba189c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -759,7 +759,7 @@
; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; CGP-NEXT: ; implicit-def: $vgpr4
; CGP-NEXT: ; implicit-def: $vgpr10
-; CGP-NEXT: .LBB2_2: ; %Flow2
+; CGP-NEXT: .LBB2_2: ; %Flow1
; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[6:7]
; CGP-NEXT: s_cbranch_execz .LBB2_4
@@ -1641,7 +1641,7 @@
; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3
; CGP-NEXT: ; implicit-def: $vgpr8
-; CGP-NEXT: .LBB8_2: ; %Flow2
+; CGP-NEXT: .LBB8_2: ; %Flow1
; CGP-NEXT: s_or_saveexec_b64 s[8:9], s[8:9]
; CGP-NEXT: v_lshl_b64 v[9:10], s[6:7], v6
; CGP-NEXT: s_xor_b64 exec, exec, s[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 97806c5..e616322 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -750,7 +750,7 @@
; CGP-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
; CGP-NEXT: ; implicit-def: $vgpr4
; CGP-NEXT: ; implicit-def: $vgpr10
-; CGP-NEXT: .LBB2_2: ; %Flow2
+; CGP-NEXT: .LBB2_2: ; %Flow1
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]
; CGP-NEXT: s_cbranch_execz .LBB2_4
@@ -2181,7 +2181,7 @@
; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3
; CGP-NEXT: ; implicit-def: $vgpr8
-; CGP-NEXT: .LBB8_2: ; %Flow2
+; CGP-NEXT: .LBB8_2: ; %Flow1
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[8:9]
; CGP-NEXT: v_lshl_b64 v[9:10], s[6:7], v6
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index c8c8911..c1cb51e 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -227,30 +227,31 @@
; GCN-LABEL: {{^}}uniform_unconditional_min_long_forward_branch:
; GCN: s_cmp_eq_u32
-; GCN: s_cbranch_scc{{[0-1]}} [[BB2:.LBB[0-9]+_[0-9]+]]
+; GCN: s_cbranch_scc{{[0-1]}} [[BB1:.LBB[0-9]+_[0-9]+]]
; GCN-NEXT: {{.LBB[0-9]+_[0-9]+}}: ; %bb0
; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}}
; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}}
-; GCN-NEXT: s_add_u32 s[[PC0_LO]], s[[PC0_LO]], ([[BB3:.LBB[0-9]_[0-9]+]]-[[POST_GETPC]])&4294967295
-; GCN-NEXT: s_addc_u32 s[[PC0_HI]], s[[PC0_HI]], ([[BB3:.LBB[0-9]_[0-9]+]]-[[POST_GETPC]])>>32
+; GCN-NEXT: s_add_u32 s[[PC0_LO]], s[[PC0_LO]], ([[BB4:.LBB[0-9]_[0-9]+]]-[[POST_GETPC]])&4294967295
+; GCN-NEXT: s_addc_u32 s[[PC0_HI]], s[[PC0_HI]], ([[BB4]]-[[POST_GETPC]])>>32
; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC0_LO]]:[[PC0_HI]]{{\]}}
-; GCN: [[BB2]]: ; %bb3
-; GCN: v_nop_e64
-; GCN: v_nop_e64
-; GCN: v_nop_e64
-; GCN: v_nop_e64
-; GCN: ;;#ASMEND
-
-; GCN: [[BB3]]:
+; GCN: [[BB1]]:
; GCN: v_mov_b32_e32 [[BB2_K:v[0-9]+]], 17
; GCN: buffer_store_dword [[BB2_K]]
; GCN: v_mov_b32_e32 [[BB4_K:v[0-9]+]], 63
; GCN: buffer_store_dword [[BB4_K]]
; GCN: s_endpgm
-; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
+
+; GCN: [[BB4]]: ; %bb3
+; GCN: v_nop_e64
+; GCN: v_nop_e64
+; GCN: v_nop_e64
+; GCN: v_nop_e64
+; GCN: ;;#ASMEND
+
+; GCN: .Lfunc_end{{[0-9]+}}:
define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
bb0:
%tmp = icmp ne i32 %arg1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index 308bc49..e4a1177 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -1502,7 +1502,7 @@
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s5, s4, 16
; SI-NEXT: s_cmp_lg_u32 s5, 0
-; SI-NEXT: s_cbranch_scc0 .LBB14_2
+; SI-NEXT: s_cbranch_scc0 .LBB14_4
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
@@ -1510,22 +1510,22 @@
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2
; SI-NEXT: s_mov_b64 s[2:3], 0
-; SI-NEXT: s_cbranch_execz .LBB14_3
-; SI-NEXT: s_branch .LBB14_4
-; SI-NEXT: .LBB14_2:
-; SI-NEXT: s_mov_b64 s[2:3], -1
-; SI-NEXT: v_mov_b32_e32 v0, 0
-; SI-NEXT: .LBB14_3: ; %if
+; SI-NEXT: s_cbranch_execnz .LBB14_3
+; SI-NEXT: .LBB14_2: ; %if
; SI-NEXT: s_and_b32 s2, s4, 0xffff
; SI-NEXT: s_bcnt1_i32_b32 s2, s2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: .LBB14_4: ; %endif
+; SI-NEXT: .LBB14_3: ; %endif
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
+; SI-NEXT: .LBB14_4:
+; SI-NEXT: s_mov_b64 s[2:3], -1
+; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: s_branch .LBB14_2
;
; VI-LABEL: ctpop_i16_in_br:
; VI: ; %bb.0: ; %entry
@@ -1535,7 +1535,7 @@
; VI-NEXT: s_lshr_b32 s5, s4, 16
; VI-NEXT: v_cmp_ne_u16_e64 s[6:7], s5, 0
; VI-NEXT: s_and_b64 vcc, exec, s[6:7]
-; VI-NEXT: s_cbranch_vccz .LBB14_2
+; VI-NEXT: s_cbranch_vccz .LBB14_4
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
@@ -1543,22 +1543,22 @@
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2
; VI-NEXT: s_mov_b64 s[2:3], 0
-; VI-NEXT: s_cbranch_execz .LBB14_3
-; VI-NEXT: s_branch .LBB14_4
-; VI-NEXT: .LBB14_2:
-; VI-NEXT: s_mov_b64 s[2:3], -1
-; VI-NEXT: ; implicit-def: $vgpr0
-; VI-NEXT: .LBB14_3: ; %if
+; VI-NEXT: s_cbranch_execnz .LBB14_3
+; VI-NEXT: .LBB14_2: ; %if
; VI-NEXT: s_and_b32 s2, s4, 0xffff
; VI-NEXT: s_bcnt1_i32_b32 s2, s2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: .LBB14_4: ; %endif
+; VI-NEXT: .LBB14_3: ; %endif
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+; VI-NEXT: .LBB14_4:
+; VI-NEXT: s_mov_b64 s[2:3], -1
+; VI-NEXT: ; implicit-def: $vgpr0
+; VI-NEXT: s_branch .LBB14_2
;
; EG-LABEL: ctpop_i16_in_br:
; EG: ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 89d319d..4de859f 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1534,19 +1534,17 @@
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s6, 0
-; SI-NEXT: s_cbranch_scc0 .LBB30_2
+; SI-NEXT: s_cbranch_scc0 .LBB30_4
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_load_dword s7, s[2:3], 0x1
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
-; SI-NEXT: s_cbranch_vccz .LBB30_3
-; SI-NEXT: s_branch .LBB30_4
-; SI-NEXT: .LBB30_2:
-; SI-NEXT: .LBB30_3: ; %if
+; SI-NEXT: s_cbranch_vccnz .LBB30_3
+; SI-NEXT: .LBB30_2: ; %if
; SI-NEXT: s_load_dword s7, s[2:3], 0x0
-; SI-NEXT: .LBB30_4: ; %endif
+; SI-NEXT: .LBB30_3: ; %endif
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: s_mov_b32 s3, 0x100f000
@@ -1554,6 +1552,8 @@
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
+; SI-NEXT: .LBB30_4:
+; SI-NEXT: s_branch .LBB30_2
;
; VI-LABEL: insert_split_bb:
; VI: ; %bb.0: ; %entry
@@ -1561,16 +1561,14 @@
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s6, 0
-; VI-NEXT: s_cbranch_scc0 .LBB30_2
+; VI-NEXT: s_cbranch_scc0 .LBB30_4
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_load_dword s7, s[2:3], 0x4
-; VI-NEXT: s_cbranch_execz .LBB30_3
-; VI-NEXT: s_branch .LBB30_4
-; VI-NEXT: .LBB30_2:
-; VI-NEXT: .LBB30_3: ; %if
+; VI-NEXT: s_cbranch_execnz .LBB30_3
+; VI-NEXT: .LBB30_2: ; %if
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s7, s[2:3], 0x0
-; VI-NEXT: .LBB30_4: ; %endif
+; VI-NEXT: .LBB30_3: ; %endif
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: s_mov_b32 s3, 0x1100f000
@@ -1578,6 +1576,8 @@
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+; VI-NEXT: .LBB30_4:
+; VI-NEXT: s_branch .LBB30_2
entry:
%0 = insertelement <2 x i32> undef, i32 %a, i32 0
%1 = icmp eq i32 %a, 0
diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll
index ea6493c..8cccb5e 100644
--- a/llvm/test/CodeGen/AMDGPU/loop_break.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll
@@ -17,11 +17,10 @@
; OPT-NEXT: br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]]
; OPT: bb4:
; OPT-NEXT: [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
-; OPT-NEXT: [[CMP1:%.*]] = icmp slt i32 [[MY_TMP]], [[LOAD]]
-; OPT-NEXT: [[TMP0:%.*]] = xor i1 [[CMP1]], true
+; OPT-NEXT: [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]]
; OPT-NEXT: br label [[FLOW]]
; OPT: Flow:
-; OPT-NEXT: [[TMP1:%.*]] = phi i1 [ [[TMP0]], [[BB4]] ], [ true, [[BB1]] ]
+; OPT-NEXT: [[TMP1:%.*]] = phi i1 [ [[CMP1]], [[BB4]] ], [ true, [[BB1]] ]
; OPT-NEXT: [[TMP2]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP1]], i64 [[PHI_BROKEN]])
; OPT-NEXT: [[TMP3:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP2]])
; OPT-NEXT: br i1 [[TMP3]], label [[BB9:%.*]], label [[BB1]]
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index ff22712..64505bc 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -9,14 +9,14 @@
; StructurizeCFG.
; IR-LABEL: @multi_divergent_region_exit_ret_ret(
-; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot.inv)
+; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot)
; IR: %1 = extractvalue { i1, i64 } %0, 0
; IR: %2 = extractvalue { i1, i64 } %0, 1
; IR: br i1 %1, label %LeafBlock1, label %Flow
; IR: Flow:
; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
-; IR: %4 = phi i1 [ %SwitchLeaf2.inv, %LeafBlock1 ], [ false, %entry ]
+; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
; IR: %6 = extractvalue { i1, i64 } %5, 0
; IR: %7 = extractvalue { i1, i64 } %5, 1
@@ -75,14 +75,13 @@
; GCN-NEXT: s_or_saveexec_b64
; GCN-NEXT: s_xor_b64
-; FIXME: Why is this compare essentially repeated?
; GCN: ; %LeafBlock
; GCN-DAG: v_cmp_eq_u32_e32 vcc, 1,
-; GCN-DAG: v_cmp_ne_u32_e64 [[TMP1:s\[[0-9]+:[0-9]+\]]], 1,
+; GCN-DAG: v_cmp_ne_u32_e64 [[INV:s\[[0-9]+:[0-9]+\]]], 1,
; GCN-DAG: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec
; GCN-DAG: s_andn2_b64 [[EXIT1]], [[EXIT1]], exec
; GCN-DAG: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
-; GCN-DAG: s_and_b64 [[TMP1]], [[TMP1]], exec
+; GCN-DAG: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], [[INV]], exec
; GCN-DAG: s_or_b64 [[EXIT0]], [[EXIT0]], [[TMP0]]
; GCN-DAG: s_or_b64 [[EXIT1]], [[EXIT1]], [[TMP1]]
@@ -141,7 +140,7 @@
}
; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
-; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot.inv)
+; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot)
; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
@@ -196,24 +195,22 @@
}
; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret(
-; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2
+; IR: %divergent.cond0 = icmp sge i32 %tmp16, 2
; IR: llvm.amdgcn.if
; IR: br i1
; IR: {{^}}Flow:
; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
-; IR: %4 = phi i1 [ %uniform.cond0.inv, %LeafBlock1 ], [ false, %entry ]
+; IR: %4 = phi i1 [ %uniform.cond0, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
; IR: br i1 %6, label %LeafBlock, label %Flow1
; IR: {{^}}LeafBlock:
; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
-; IR: %divergent.cond1.inv = xor i1 %divergent.cond1, true
; IR: br label %Flow1
; IR: LeafBlock1:
-; IR: %uniform.cond0 = icmp eq i32 %arg3, 2
-; IR: %uniform.cond0.inv = xor i1 %uniform.cond0, true
+; IR: %uniform.cond0 = icmp ne i32 %arg3, 2
; IR: br label %Flow
; IR: Flow2:
@@ -279,12 +276,12 @@
}
; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
-; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot.inv)
+; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot)
; IR: br i1 %1, label %LeafBlock1, label %Flow
; IR: Flow:
; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
-; IR: %4 = phi i1 [ %SwitchLeaf2.inv, %LeafBlock1 ], [ false, %entry ]
+; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
@@ -401,11 +398,11 @@
}
; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
-; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot.inv)
+; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot)
; IR: Flow:
; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
-; IR: %4 = phi i1 [ %SwitchLeaf2.inv, %LeafBlock1 ], [ false, %entry ]
+; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
; IR: Flow2:
@@ -640,7 +637,7 @@
; IR: br i1 %6, label %uniform.if, label %Flow2
; IR: Flow: ; preds = %uniform.then, %uniform.if
-; IR: %7 = phi i1 [ %uniform.cond2.inv, %uniform.then ], [ %uniform.cond1.inv, %uniform.if ]
+; IR: %7 = phi i1 [ %uniform.cond2, %uniform.then ], [ %uniform.cond1.inv, %uniform.if ]
; IR: br i1 %7, label %uniform.endif, label %uniform.ret0
; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
index ee90406..5ec757a 100644
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -123,14 +123,13 @@
; OPT-NEXT: [[LOAD0:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
; OPT-NEXT: br label [[NODEBLOCK:%.*]]
; OPT: NodeBlock:
-; OPT-NEXT: [[PIVOT:%.*]] = icmp slt i32 [[LOAD0]], 1
-; OPT-NEXT: [[PIVOT_INV:%.*]] = xor i1 [[PIVOT]], true
-; OPT-NEXT: br i1 [[PIVOT_INV]], label [[LEAFBLOCK1:%.*]], label [[FLOW:%.*]]
+; OPT-NEXT: [[PIVOT:%.*]] = icmp sge i32 [[LOAD0]], 1
+; OPT-NEXT: br i1 [[PIVOT]], label [[LEAFBLOCK1:%.*]], label [[FLOW:%.*]]
; OPT: LeafBlock1:
; OPT-NEXT: [[SWITCHLEAF2:%.*]] = icmp eq i32 [[LOAD0]], 1
; OPT-NEXT: br i1 [[SWITCHLEAF2]], label [[CASE1:%.*]], label [[FLOW3:%.*]]
; OPT: Flow3:
-; OPT-NEXT: [[TMP0:%.*]] = phi i1 [ [[CMP2_INV:%.*]], [[CASE1]] ], [ true, [[LEAFBLOCK1]] ]
+; OPT-NEXT: [[TMP0:%.*]] = phi i1 [ [[CMP2:%.*]], [[CASE1]] ], [ true, [[LEAFBLOCK1]] ]
; OPT-NEXT: [[TMP1:%.*]] = phi i1 [ false, [[CASE1]] ], [ true, [[LEAFBLOCK1]] ]
; OPT-NEXT: br label [[FLOW]]
; OPT: LeafBlock:
@@ -144,8 +143,7 @@
; OPT-NEXT: br i1 [[TMP5]], label [[FLOW6:%.*]], label [[BB1]]
; OPT: case0:
; OPT-NEXT: [[LOAD1:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
-; OPT-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP]], [[LOAD1]]
-; OPT-NEXT: [[CMP1_INV:%.*]] = xor i1 [[CMP1]], true
+; OPT-NEXT: [[CMP1:%.*]] = icmp sge i32 [[TMP]], [[LOAD1]]
; OPT-NEXT: br label [[FLOW5]]
; OPT: Flow:
; OPT-NEXT: [[TMP6]] = phi i1 [ [[TMP0]], [[FLOW3]] ], [ true, [[NODEBLOCK]] ]
@@ -154,11 +152,10 @@
; OPT-NEXT: br i1 [[TMP8]], label [[LEAFBLOCK:%.*]], label [[FLOW4]]
; OPT: case1:
; OPT-NEXT: [[LOAD2:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
-; OPT-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP]], [[LOAD2]]
-; OPT-NEXT: [[CMP2_INV]] = xor i1 [[CMP2]], true
+; OPT-NEXT: [[CMP2]] = icmp sge i32 [[TMP]], [[LOAD2]]
; OPT-NEXT: br label [[FLOW3]]
; OPT: Flow5:
-; OPT-NEXT: [[TMP9]] = phi i1 [ [[CMP1_INV]], [[CASE0]] ], [ [[TMP6]], [[LEAFBLOCK]] ]
+; OPT-NEXT: [[TMP9]] = phi i1 [ [[CMP1]], [[CASE0]] ], [ [[TMP6]], [[LEAFBLOCK]] ]
; OPT-NEXT: [[TMP10]] = phi i1 [ false, [[CASE0]] ], [ true, [[LEAFBLOCK]] ]
; OPT-NEXT: br label [[FLOW4]]
; OPT: Flow6:
@@ -196,8 +193,8 @@
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
; GCN-NEXT: s_mov_b64 s[6:7], -1
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9
; GCN-NEXT: s_mov_b64 s[10:11], -1
diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index a1fa2ab..113e346 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -236,8 +236,8 @@
; IR: Flow1:
; IR-NEXT: [[TMP11]] = phi <4 x i32> [ [[MY_TMP9:%.*]], [[BB21:%.*]] ], [ undef, [[BB14]] ]
; IR-NEXT: [[TMP12]] = phi i32 [ [[MY_TMP10:%.*]], [[BB21]] ], [ undef, [[BB14]] ]
-; IR-NEXT: [[TMP13:%.*]] = phi i1 [ [[TMP18:%.*]], [[BB21]] ], [ true, [[BB14]] ]
-; IR-NEXT: [[TMP14]] = phi i1 [ [[TMP18]], [[BB21]] ], [ false, [[BB14]] ]
+; IR-NEXT: [[TMP13:%.*]] = phi i1 [ [[MY_TMP12:%.*]], [[BB21]] ], [ true, [[BB14]] ]
+; IR-NEXT: [[TMP14]] = phi i1 [ [[MY_TMP12]], [[BB21]] ], [ false, [[BB14]] ]
; IR-NEXT: [[TMP15:%.*]] = phi i1 [ false, [[BB21]] ], [ true, [[BB14]] ]
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP10]])
; IR-NEXT: [[TMP16]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP13]], i64 [[PHI_BROKEN]])
@@ -262,8 +262,7 @@
; IR-NEXT: [[MY_TMP9]] = load <4 x i32>, <4 x i32> addrspace(1)* [[MY_TMP8]], align 16
; IR-NEXT: [[MY_TMP10]] = extractelement <4 x i32> [[MY_TMP9]], i64 0
; IR-NEXT: [[MY_TMP11:%.*]] = load volatile i32, i32 addrspace(1)* undef
-; IR-NEXT: [[MY_TMP12:%.*]] = icmp slt i32 [[MY_TMP11]], 9
-; IR-NEXT: [[TMP18]] = xor i1 [[MY_TMP12]], true
+; IR-NEXT: [[MY_TMP12]] = icmp sge i32 [[MY_TMP11]], 9
; IR-NEXT: br label [[FLOW1]]
; IR: Flow2:
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP16]])
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
index f12bed6..b4eb682 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
@@ -36,19 +36,17 @@
; GCN-LABEL: {{^}}negated_cond_dominated_blocks:
; GCN: s_cmp_lg_u32
-; GCN: s_cselect_b64 [[CC1:[^,]+]], -1, 0
+; GCN: s_cselect_b64 [[CC1:[^,]+]], -1, 0
; GCN: s_branch [[BB1:.LBB[0-9]+_[0-9]+]]
; GCN: [[BB0:.LBB[0-9]+_[0-9]+]]
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_cmp
; GCN: [[BB1]]:
-; GCN: s_mov_b64 [[CC2:[^,]+]], -1
; GCN: s_mov_b64 vcc, [[CC1]]
; GCN: s_cbranch_vccz [[BB2:.LBB[0-9]+_[0-9]+]]
-; GCN: s_mov_b64 [[CC2]], 0
+; GCN: s_mov_b64 vcc, exec
+; GCN: s_cbranch_execnz [[BB0]]
; GCN: [[BB2]]:
-; GCN: s_andn2_b64 vcc, exec, [[CC2]]
-; GCN: s_cbranch_vccnz [[BB0]]
define amdgpu_kernel void @negated_cond_dominated_blocks(i32 addrspace(1)* %arg1) {
bb:
br label %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index 6a2b7ca..15384e6 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -16,22 +16,22 @@
; SI-NEXT: s_load_dword s0, s[0:1], 0xf
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s8, 0
-; SI-NEXT: s_cbranch_scc0 .LBB0_2
+; SI-NEXT: s_cbranch_scc0 .LBB0_4
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_add_i32 s2, s11, s0
-; SI-NEXT: s_cbranch_execz .LBB0_3
-; SI-NEXT: s_branch .LBB0_4
-; SI-NEXT: .LBB0_2:
-; SI-NEXT: ; implicit-def: $sgpr2
-; SI-NEXT: .LBB0_3: ; %if
+; SI-NEXT: s_cbranch_execnz .LBB0_3
+; SI-NEXT: .LBB0_2: ; %if
; SI-NEXT: s_sub_i32 s2, s9, s10
-; SI-NEXT: .LBB0_4: ; %endif
+; SI-NEXT: .LBB0_3: ; %endif
; SI-NEXT: s_add_i32 s0, s2, s8
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
+; SI-NEXT: .LBB0_4:
+; SI-NEXT: ; implicit-def: $sgpr2
+; SI-NEXT: s_branch .LBB0_2
entry:
%0 = icmp eq i32 %a, 0
@@ -59,28 +59,28 @@
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s6, 0
-; SI-NEXT: s_cbranch_scc0 .LBB1_2
+; SI-NEXT: s_cbranch_scc0 .LBB1_4
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_load_dword s2, s[0:1], 0x2e
; SI-NEXT: s_load_dword s3, s[0:1], 0x37
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_i32 s7, s2, s3
-; SI-NEXT: s_cbranch_execz .LBB1_3
-; SI-NEXT: s_branch .LBB1_4
-; SI-NEXT: .LBB1_2:
-; SI-NEXT: ; implicit-def: $sgpr7
-; SI-NEXT: .LBB1_3: ; %if
+; SI-NEXT: s_cbranch_execnz .LBB1_3
+; SI-NEXT: .LBB1_2: ; %if
; SI-NEXT: s_load_dword s2, s[0:1], 0x1c
; SI-NEXT: s_load_dword s0, s[0:1], 0x25
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_i32 s7, s2, s0
-; SI-NEXT: .LBB1_4: ; %endif
+; SI-NEXT: .LBB1_3: ; %endif
; SI-NEXT: s_add_i32 s0, s7, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
+; SI-NEXT: .LBB1_4:
+; SI-NEXT: ; implicit-def: $sgpr7
+; SI-NEXT: s_branch .LBB1_2
entry:
%cmp0 = icmp eq i32 %a, 0
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
index 874326d..ef2acafa 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -213,7 +213,7 @@
; CHECK-LABEL: {{^}}sample_v3:
; CHECK: v_mov_b32_e32 v[[SAMPLE_LO:[0-9]+]], 5
; CHECK: v_mov_b32_e32 v[[SAMPLE_HI:[0-9]+]], 7
-; CHECK: s_branch
+; CHECK: s_cbranch
; CHECK: BB{{[0-9]+_[0-9]+}}:
; CHECK-DAG: v_mov_b32_e32 v[[SAMPLE_LO:[0-9]+]], 11
@@ -315,13 +315,15 @@
; CHECK-LABEL:{{^}}sample_rsrc
; CHECK: s_cmp_eq_u32
-; CHECK: s_cbranch_scc0 [[END:.LBB[0-9]+_[0-9]+]]
+; CHECK: s_cbranch_scc1 [[END:.LBB[0-9]+_[0-9]+]]
-; CHECK: v_add_{{[iu]}}32_e32 v[[ADD:[0-9]+]], vcc, 1, v{{[0-9]+}}
+; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
+; CHECK: s_endpgm
; [[END]]:
+; CHECK: v_add_{{[iu]}}32_e32 v[[ADD:[0-9]+]], vcc, 1, v{{[0-9]+}}
; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[ADD]]{{\]}}
-; CHECK: s_endpgm
+; CHECK: s_branch
define amdgpu_ps void @sample_rsrc([6 x <4 x i32>] addrspace(4)* inreg %arg, [17 x <4 x i32>] addrspace(4)* inreg %arg1, [16 x <4 x i32>] addrspace(4)* inreg %arg2, [32 x <8 x i32>] addrspace(4)* inreg %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
bb:
%tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg1, i32 0, i32 0