[VPlan] Handle FirstActiveLane when unrolling. (#145394) Currently FirstActiveLane is not handled correctly during unrolling. This is currently causing mis-compiles when vectorizing early-exit loops with interleaving forced. This patch updates handling of FirstActiveLane to be analogous to computing final reduction results: during unrolling, the created copies for its original operand are added as additional operands, and FirstActiveLane will always produce the index of the first active lane across all unrolled iterations. Note that some of the generated code is still incorrect, as we also need to handle ExtractElement with FirstActiveLane operands. I will share patches for those soon as well. PR: https://github.com/llvm/llvm-project/pull/145394

commit: ec62dee70349f6d24cab641343a021b6c2fc679a [log] [tgz]
author: Florian Hahn <flo@fhahn.com> Fri Jun 27 08:44:57 2025 +0100
committer: GitHub <noreply@github.com> Fri Jun 27 08:44:57 2025 +0100
tree: 7e68d07094fd28e866c3c501f6093f697e8b1802
parent: 045b82736702799f7f56da2f98a3fa1f2fae8d46 [diff]
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 82afc63..32e788b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h

@@ -966,7 +966,9 @@
     // all unrolled iterations. Unrolling will add all copies of its original
     // operand as additional operands.
     AnyOf,
-    // Calculates the first active lane index of the vector predicate operand.
+    // Calculates the first active lane index of the vector predicate operands.
+    // It produces the lane index across all unrolled iterations. Unrolling will
+    // add all copies of its original operand as additional operands.
     FirstActiveLane,
 
     // The opcodes below are used for VPInstructionWithType.

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bd09feb..7b30240 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

@@ -856,9 +856,32 @@
     return Builder.CreateOrReduce(Res);
   }
   case VPInstruction::FirstActiveLane: {
-    Value *Mask = State.get(getOperand(0));
-    return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
-                                                true, Name);
+    if (getNumOperands() == 1) {
+      Value *Mask = State.get(getOperand(0));
+      return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
+                                                  true, Name);
+    }
+    // If there are multiple operands, create a chain of selects to pick the
+    // first operand with an active lane and add the number of lanes of the
+    // preceding operands.
+    Value *RuntimeVF =
+        getRuntimeVF(State.Builder, State.Builder.getInt64Ty(), State.VF);
+    unsigned LastOpIdx = getNumOperands() - 1;
+    Value *Res = nullptr;
+    for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
+      Value *TrailingZeros = Builder.CreateCountTrailingZeroElems(
+          Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name);
+      Value *Current = Builder.CreateAdd(
+          Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
+      if (Res) {
+        Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
+        Res = Builder.CreateSelect(Cmp, Current, Res);
+      } else {
+        Res = Current;
+      }
+    }
+
+    return Res;
   }
   default:
     llvm_unreachable("Unsupported opcode for instruction");

diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 01ac6a9..4bc9e40 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

@@ -345,10 +345,12 @@
     if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
       continue;
 
-    // Add all VPValues for all parts to AnyOf and Compute*Result which combine
-    // all parts to compute the final value.
+    // Add all VPValues for all parts to AnyOf, FirstActiveLaneMask and
+    // Compute*Result which combine all parts to compute the final value.
     VPValue *Op1;
     if (match(&R, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(Op1))) ||
+        match(&R, m_VPInstruction<VPInstruction::FirstActiveLane>(
+                      m_VPValue(Op1))) ||
         match(&R, m_VPInstruction<VPInstruction::ComputeAnyOfResult>(
                       m_VPValue(), m_VPValue(), m_VPValue(Op1))) ||
         match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
index b5cef4f..551fcd2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll

@@ -77,8 +77,27 @@
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 510, [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP63:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP42:%.*]] = mul nuw i64 [[TMP63]], 16
+; CHECK-NEXT:    [[TMP44:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP42]], 3
+; CHECK-NEXT:    [[TMP45:%.*]] = add i64 [[TMP62]], [[TMP44]]
+; CHECK-NEXT:    [[TMP46:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP31]], i1 true)
+; CHECK-NEXT:    [[TMP58:%.*]] = mul i64 [[TMP42]], 2
+; CHECK-NEXT:    [[TMP50:%.*]] = add i64 [[TMP58]], [[TMP46]]
+; CHECK-NEXT:    [[TMP47:%.*]] = icmp ne i64 [[TMP46]], [[TMP42]]
+; CHECK-NEXT:    [[TMP51:%.*]] = select i1 [[TMP47]], i64 [[TMP50]], i64 [[TMP45]]
+; CHECK-NEXT:    [[TMP52:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP30]], i1 true)
+; CHECK-NEXT:    [[TMP64:%.*]] = mul i64 [[TMP42]], 1
+; CHECK-NEXT:    [[TMP56:%.*]] = add i64 [[TMP64]], [[TMP52]]
+; CHECK-NEXT:    [[TMP53:%.*]] = icmp ne i64 [[TMP52]], [[TMP42]]
+; CHECK-NEXT:    [[TMP57:%.*]] = select i1 [[TMP53]], i64 [[TMP56]], i64 [[TMP51]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP11]], i1 true)
-; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP15]]
+; CHECK-NEXT:    [[TMP65:%.*]] = mul i64 [[TMP42]], 0
+; CHECK-NEXT:    [[TMP60:%.*]] = add i64 [[TMP65]], [[TMP15]]
+; CHECK-NEXT:    [[TMP59:%.*]] = icmp ne i64 [[TMP15]], [[TMP42]]
+; CHECK-NEXT:    [[TMP61:%.*]] = select i1 [[TMP59]], i64 [[TMP60]], i64 [[TMP57]]
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP61]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = add i64 3, [[TMP16]]
 ; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:

diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
index ae7adc6..6b652a0 100644
--- a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll

@@ -3,7 +3,6 @@
 
 declare void @init_mem(ptr, i64);
 
-; FIXME: Computes incorrect final IV when early exit is taken.
 define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
 ; VF4IC4-LABEL: define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
 ; VF4IC4-NEXT:  entry:
@@ -87,7 +86,6 @@
   ret i64 %p2
 }
 
-; FIXME: Computes incorrect final IV when early exit is taken.
 define i64 @same_exit_block_pre_inc_use1() {
 ; VF4IC4-LABEL: define i64 @same_exit_block_pre_inc_use1() {
 ; VF4IC4-NEXT:  entry:
@@ -103,19 +101,19 @@
 ; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
-; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
-; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
-; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
+; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
-; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
-; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
-; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 4
-; VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 8
-; VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 12
-; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 4
+; VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 8
+; VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 12
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
 ; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1
 ; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1
 ; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1
@@ -136,7 +134,20 @@
 ; VF4IC4:       middle.block:
 ; VF4IC4-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; VF4IC4:       vector.early.exit:
-; VF4IC4-NEXT:    [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP33:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; VF4IC4-NEXT:    [[TMP34:%.*]] = add i64 12, [[TMP33]]
+; VF4IC4-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
+; VF4IC4-NEXT:    [[TMP24:%.*]] = add i64 8, [[TMP35]]
+; VF4IC4-NEXT:    [[TMP23:%.*]] = icmp ne i64 [[TMP35]], 4
+; VF4IC4-NEXT:    [[TMP25:%.*]] = select i1 [[TMP23]], i64 [[TMP24]], i64 [[TMP34]]
+; VF4IC4-NEXT:    [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
+; VF4IC4-NEXT:    [[TMP28:%.*]] = add i64 4, [[TMP26]]
+; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP26]], 4
+; VF4IC4-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP25]]
+; VF4IC4-NEXT:    [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP32:%.*]] = add i64 0, [[TMP30]]
+; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP30]], 4
+; VF4IC4-NEXT:    [[TMP8:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP29]]
 ; VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
 ; VF4IC4-NEXT:    [[TMP10:%.*]] = add i64 3, [[TMP9]]
 ; VF4IC4-NEXT:    br label [[LOOP_END]]
@@ -185,7 +196,6 @@
   ret i64 %retval
 }
 
-; FIXME: Computes incorrect final IV when early exit is taken.
 define ptr @same_exit_block_pre_inc_use1_ivptr() {
 ; VF4IC4-LABEL: define ptr @same_exit_block_pre_inc_use1_ivptr() {
 ; VF4IC4-NEXT:  entry:
@@ -224,7 +234,20 @@
 ; VF4IC4:       middle.block:
 ; VF4IC4-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; VF4IC4:       vector.early.exit:
-; VF4IC4-NEXT:    [[TMP6:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 true)
+; VF4IC4-NEXT:    [[TMP28:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true)
+; VF4IC4-NEXT:    [[TMP29:%.*]] = add i64 12, [[TMP28]]
+; VF4IC4-NEXT:    [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP16]], i1 true)
+; VF4IC4-NEXT:    [[TMP19:%.*]] = add i64 8, [[TMP30]]
+; VF4IC4-NEXT:    [[TMP18:%.*]] = icmp ne i64 [[TMP30]], 4
+; VF4IC4-NEXT:    [[TMP20:%.*]] = select i1 [[TMP18]], i64 [[TMP19]], i64 [[TMP29]]
+; VF4IC4-NEXT:    [[TMP21:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP15]], i1 true)
+; VF4IC4-NEXT:    [[TMP23:%.*]] = add i64 4, [[TMP21]]
+; VF4IC4-NEXT:    [[TMP22:%.*]] = icmp ne i64 [[TMP21]], 4
+; VF4IC4-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i64 [[TMP23]], i64 [[TMP20]]
+; VF4IC4-NEXT:    [[TMP25:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 true)
+; VF4IC4-NEXT:    [[TMP27:%.*]] = add i64 0, [[TMP25]]
+; VF4IC4-NEXT:    [[TMP26:%.*]] = icmp ne i64 [[TMP25]], 4
+; VF4IC4-NEXT:    [[TMP6:%.*]] = select i1 [[TMP26]], i64 [[TMP27]], i64 [[TMP24]]
 ; VF4IC4-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], [[TMP6]]
 ; VF4IC4-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP7]]
 ; VF4IC4-NEXT:    br label [[LOOP_END]]
@@ -266,7 +289,6 @@
   ret ptr %retval
 }
 
-; FIXME: Computes incorrect final IV when early exit is taken.
 define i64 @same_exit_block_post_inc_use() {
 ; VF4IC4-LABEL: define i64 @same_exit_block_post_inc_use() {
 ; VF4IC4-NEXT:  entry:
@@ -282,19 +304,19 @@
 ; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
-; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
-; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
-; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
+; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
-; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
-; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
-; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 4
-; VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 8
-; VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 12
-; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 4
+; VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 8
+; VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 12
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
 ; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1
 ; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1
 ; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1
@@ -315,7 +337,20 @@
 ; VF4IC4:       middle.block:
 ; VF4IC4-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; VF4IC4:       vector.early.exit:
-; VF4IC4-NEXT:    [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP33:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; VF4IC4-NEXT:    [[TMP34:%.*]] = add i64 12, [[TMP33]]
+; VF4IC4-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
+; VF4IC4-NEXT:    [[TMP24:%.*]] = add i64 8, [[TMP35]]
+; VF4IC4-NEXT:    [[TMP23:%.*]] = icmp ne i64 [[TMP35]], 4
+; VF4IC4-NEXT:    [[TMP25:%.*]] = select i1 [[TMP23]], i64 [[TMP24]], i64 [[TMP34]]
+; VF4IC4-NEXT:    [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
+; VF4IC4-NEXT:    [[TMP28:%.*]] = add i64 4, [[TMP26]]
+; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP26]], 4
+; VF4IC4-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP25]]
+; VF4IC4-NEXT:    [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP32:%.*]] = add i64 0, [[TMP30]]
+; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP30]], 4
+; VF4IC4-NEXT:    [[TMP8:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP29]]
 ; VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
 ; VF4IC4-NEXT:    [[TMP10:%.*]] = add i64 3, [[TMP9]]
 ; VF4IC4-NEXT:    br label [[LOOP_END]]
@@ -364,7 +399,6 @@
   ret i64 %retval
 }
 
-; FIXME: Computes incorrect final IV when early exit is taken.
 define i64 @diff_exit_block_pre_inc_use1() {
 ; VF4IC4-LABEL: define i64 @diff_exit_block_pre_inc_use1() {
 ; VF4IC4-NEXT:  entry:
@@ -380,19 +414,19 @@
 ; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
-; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
-; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
-; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
+; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
-; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
-; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
-; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 4
-; VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 8
-; VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 12
-; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 4
+; VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 8
+; VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 12
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
 ; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1
 ; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1
 ; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1
@@ -413,7 +447,20 @@
 ; VF4IC4:       middle.block:
 ; VF4IC4-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; VF4IC4:       vector.early.exit:
-; VF4IC4-NEXT:    [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP33:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; VF4IC4-NEXT:    [[TMP34:%.*]] = add i64 12, [[TMP33]]
+; VF4IC4-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
+; VF4IC4-NEXT:    [[TMP24:%.*]] = add i64 8, [[TMP35]]
+; VF4IC4-NEXT:    [[TMP23:%.*]] = icmp ne i64 [[TMP35]], 4
+; VF4IC4-NEXT:    [[TMP25:%.*]] = select i1 [[TMP23]], i64 [[TMP24]], i64 [[TMP34]]
+; VF4IC4-NEXT:    [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
+; VF4IC4-NEXT:    [[TMP28:%.*]] = add i64 4, [[TMP26]]
+; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP26]], 4
+; VF4IC4-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP25]]
+; VF4IC4-NEXT:    [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP32:%.*]] = add i64 0, [[TMP30]]
+; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP30]], 4
+; VF4IC4-NEXT:    [[TMP8:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP29]]
 ; VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
 ; VF4IC4-NEXT:    [[TMP10:%.*]] = add i64 3, [[TMP9]]
 ; VF4IC4-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
@@ -469,7 +516,6 @@
   ret i64 %retval2
 }
 
-; FIXME: Computes incorrect final IV when early exit is taken.
 define i64 @diff_exit_block_post_inc_use1() {
 ; VF4IC4-LABEL: define i64 @diff_exit_block_post_inc_use1() {
 ; VF4IC4-NEXT:  entry:
@@ -485,19 +531,19 @@
 ; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
-; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
-; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
-; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
+; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
-; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
-; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
-; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 4
-; VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 8
-; VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 12
-; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; VF4IC4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 4
+; VF4IC4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 8
+; VF4IC4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 12
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
 ; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1
 ; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1
 ; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1
@@ -518,7 +564,20 @@
 ; VF4IC4:       middle.block:
 ; VF4IC4-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; VF4IC4:       vector.early.exit:
-; VF4IC4-NEXT:    [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP33:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; VF4IC4-NEXT:    [[TMP34:%.*]] = add i64 12, [[TMP33]]
+; VF4IC4-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
+; VF4IC4-NEXT:    [[TMP24:%.*]] = add i64 8, [[TMP35]]
+; VF4IC4-NEXT:    [[TMP23:%.*]] = icmp ne i64 [[TMP35]], 4
+; VF4IC4-NEXT:    [[TMP25:%.*]] = select i1 [[TMP23]], i64 [[TMP24]], i64 [[TMP34]]
+; VF4IC4-NEXT:    [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
+; VF4IC4-NEXT:    [[TMP28:%.*]] = add i64 4, [[TMP26]]
+; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP26]], 4
+; VF4IC4-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP25]]
+; VF4IC4-NEXT:    [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP32:%.*]] = add i64 0, [[TMP30]]
+; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP30]], 4
+; VF4IC4-NEXT:    [[TMP8:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP29]]
 ; VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
 ; VF4IC4-NEXT:    [[TMP10:%.*]] = add i64 3, [[TMP9]]
 ; VF4IC4-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
@@ -574,7 +633,6 @@
   ret i64 %retval2
 }
 
-; FIXME: Computes incorrect final IV when early exit is taken.
 define i64 @same_exit_block_pre_inc_use1_reverse() {
 ; VF4IC4-LABEL: define i64 @same_exit_block_pre_inc_use1_reverse() {
 ; VF4IC4-NEXT:  entry:
@@ -592,29 +650,12 @@
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
 ; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 -3
 ; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -4
-; VF4IC4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 -3
-; VF4IC4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -8
-; VF4IC4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 -3
-; VF4IC4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -12
-; VF4IC4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 -3
-; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
-; VF4IC4-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP25]], align 1
-; VF4IC4-NEXT:    [[REVERSE3:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD2]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP27]], align 1
-; VF4IC4-NEXT:    [[REVERSE4:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD3]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP29]], align 1
-; VF4IC4-NEXT:    [[REVERSE6:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD5]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; VF4IC4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; VF4IC4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 -3
-; VF4IC4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -4
-; VF4IC4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 -3
-; VF4IC4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -8
+; VF4IC4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 -3
+; VF4IC4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -8
 ; VF4IC4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 -3
-; VF4IC4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -12
+; VF4IC4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -12
 ; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 -3
-; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
 ; VF4IC4-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; VF4IC4-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
 ; VF4IC4-NEXT:    [[REVERSE10:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD9]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -622,10 +663,27 @@
 ; VF4IC4-NEXT:    [[REVERSE12:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD11]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; VF4IC4-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
 ; VF4IC4-NEXT:    [[REVERSE14:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD13]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp ne <4 x i8> [[REVERSE]], [[REVERSE2]]
-; VF4IC4-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i8> [[REVERSE3]], [[REVERSE10]]
-; VF4IC4-NEXT:    [[TMP20:%.*]] = icmp ne <4 x i8> [[REVERSE4]], [[REVERSE12]]
-; VF4IC4-NEXT:    [[TMP21:%.*]] = icmp ne <4 x i8> [[REVERSE6]], [[REVERSE14]]
+; VF4IC4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 0
+; VF4IC4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 -3
+; VF4IC4-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 -4
+; VF4IC4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 -3
+; VF4IC4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 -8
+; VF4IC4-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 -3
+; VF4IC4-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 -12
+; VF4IC4-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP45]], i32 -3
+; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP27]], align 1
+; VF4IC4-NEXT:    [[REVERSE8:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD7]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x i8>, ptr [[TMP29]], align 1
+; VF4IC4-NEXT:    [[REVERSE11:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD10]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i8>, ptr [[TMP44]], align 1
+; VF4IC4-NEXT:    [[REVERSE13:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD12]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x i8>, ptr [[TMP46]], align 1
+; VF4IC4-NEXT:    [[REVERSE15:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD14]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp ne <4 x i8> [[REVERSE2]], [[REVERSE8]]
+; VF4IC4-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i8> [[REVERSE10]], [[REVERSE11]]
+; VF4IC4-NEXT:    [[TMP20:%.*]] = icmp ne <4 x i8> [[REVERSE12]], [[REVERSE13]]
+; VF4IC4-NEXT:    [[TMP21:%.*]] = icmp ne <4 x i8> [[REVERSE14]], [[REVERSE15]]
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; VF4IC4-NEXT:    [[TMP22:%.*]] = or <4 x i1> [[TMP6]], [[TMP19]]
 ; VF4IC4-NEXT:    [[TMP23:%.*]] = or <4 x i1> [[TMP22]], [[TMP20]]
@@ -639,7 +697,20 @@
 ; VF4IC4:       middle.block:
 ; VF4IC4-NEXT:    br i1 false, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; VF4IC4:       vector.early.exit:
-; VF4IC4-NEXT:    [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; VF4IC4-NEXT:    [[TMP41:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP21]], i1 true)
+; VF4IC4-NEXT:    [[TMP42:%.*]] = add i64 12, [[TMP41]]
+; VF4IC4-NEXT:    [[TMP43:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP20]], i1 true)
+; VF4IC4-NEXT:    [[TMP32:%.*]] = add i64 8, [[TMP43]]
+; VF4IC4-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP43]], 4
+; VF4IC4-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP42]]
+; VF4IC4-NEXT:    [[TMP34:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 true)
+; VF4IC4-NEXT:    [[TMP36:%.*]] = add i64 4, [[TMP34]]
+; VF4IC4-NEXT:    [[TMP35:%.*]] = icmp ne i64 [[TMP34]], 4
+; VF4IC4-NEXT:    [[TMP37:%.*]] = select i1 [[TMP35]], i64 [[TMP36]], i64 [[TMP33]]
+; VF4IC4-NEXT:    [[TMP38:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; VF4IC4-NEXT:    [[TMP40:%.*]] = add i64 0, [[TMP38]]
+; VF4IC4-NEXT:    [[TMP39:%.*]] = icmp ne i64 [[TMP38]], 4
+; VF4IC4-NEXT:    [[TMP10:%.*]] = select i1 [[TMP39]], i64 [[TMP40]], i64 [[TMP37]]
 ; VF4IC4-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
 ; VF4IC4-NEXT:    [[TMP12:%.*]] = sub i64 1023, [[TMP11]]
 ; VF4IC4-NEXT:    br label [[LOOP_END]]
@@ -704,23 +775,23 @@
 ; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
 ; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
 ; VF4IC4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
-; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
-; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
+; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
-; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
-; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
-; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 4
-; VF4IC4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 8
-; VF4IC4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 12
-; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
-; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; VF4IC4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
+; VF4IC4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 4
+; VF4IC4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 8
+; VF4IC4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 12
+; VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP29]], align 1
 ; VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
 ; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
-; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; VF4IC4-NEXT:    [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
+; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; VF4IC4-NEXT:    [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
 ; VF4IC4-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
 ; VF4IC4-NEXT:    [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -736,7 +807,20 @@
 ; VF4IC4:       middle.block:
 ; VF4IC4-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; VF4IC4:       vector.early.exit:
-; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE1:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; VF4IC4-NEXT:    [[TMP20:%.*]] = add i64 12, [[FIRST_ACTIVE_LANE1]]
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
+; VF4IC4-NEXT:    [[TMP22:%.*]] = add i64 8, [[FIRST_ACTIVE_LANE8]]
+; VF4IC4-NEXT:    [[TMP21:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE8]], 4
+; VF4IC4-NEXT:    [[TMP23:%.*]] = select i1 [[TMP21]], i64 [[TMP22]], i64 [[TMP20]]
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
+; VF4IC4-NEXT:    [[TMP25:%.*]] = add i64 4, [[FIRST_ACTIVE_LANE9]]
+; VF4IC4-NEXT:    [[TMP24:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE9]], 4
+; VF4IC4-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i64 [[TMP25]], i64 [[TMP23]]
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP28:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE10]]
+; VF4IC4-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE10]], 4
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP26]]
 ; VF4IC4-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i8> [[WIDE_LOAD]], i64 [[FIRST_ACTIVE_LANE]]
 ; VF4IC4-NEXT:    br label [[LOOP_END]]
 ; VF4IC4:       scalar.ph:
@@ -804,52 +888,65 @@
 ; VF4IC4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -4
 ; VF4IC4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 -3
 ; VF4IC4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -8
-; VF4IC4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 -3
-; VF4IC4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -12
-; VF4IC4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 -3
+; VF4IC4-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 -3
+; VF4IC4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -12
+; VF4IC4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 -3
 ; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
 ; VF4IC4-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF4IC4-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
-; VF4IC4-NEXT:    [[REVERSE3:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD2]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP25]], align 1
-; VF4IC4-NEXT:    [[REVERSE4:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD3]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP27]], align 1
-; VF4IC4-NEXT:    [[REVERSE6:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD5]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; VF4IC4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; VF4IC4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 -3
-; VF4IC4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -4
-; VF4IC4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 -3
-; VF4IC4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -8
-; VF4IC4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 -3
-; VF4IC4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -12
-; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 -3
-; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
 ; VF4IC4-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP37]], align 1
+; VF4IC4-NEXT:    [[REVERSE4:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD3]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
+; VF4IC4-NEXT:    [[REVERSE6:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD5]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; VF4IC4-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[TMP38]], i32 -3
+; VF4IC4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 -4
+; VF4IC4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 -3
+; VF4IC4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 -8
+; VF4IC4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 -3
+; VF4IC4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 -12
+; VF4IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 -3
+; VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP39]], align 1
+; VF4IC4-NEXT:    [[REVERSE8:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD7]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; VF4IC4-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
 ; VF4IC4-NEXT:    [[REVERSE10:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD9]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; VF4IC4-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
 ; VF4IC4-NEXT:    [[REVERSE12:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD11]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; VF4IC4-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
 ; VF4IC4-NEXT:    [[REVERSE14:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD13]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp ne <4 x i8> [[REVERSE]], [[REVERSE2]]
-; VF4IC4-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i8> [[REVERSE3]], [[REVERSE10]]
+; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp ne <4 x i8> [[REVERSE]], [[REVERSE8]]
+; VF4IC4-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i8> [[REVERSE2]], [[REVERSE10]]
 ; VF4IC4-NEXT:    [[TMP20:%.*]] = icmp ne <4 x i8> [[REVERSE4]], [[REVERSE12]]
 ; VF4IC4-NEXT:    [[TMP21:%.*]] = icmp ne <4 x i8> [[REVERSE6]], [[REVERSE14]]
 ; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; VF4IC4-NEXT:    [[TMP22:%.*]] = or <4 x i1> [[TMP6]], [[TMP19]]
 ; VF4IC4-NEXT:    [[TMP23:%.*]] = or <4 x i1> [[TMP22]], [[TMP20]]
 ; VF4IC4-NEXT:    [[TMP24:%.*]] = or <4 x i1> [[TMP23]], [[TMP21]]
-; VF4IC4-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP24]])
-; VF4IC4-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1008
-; VF4IC4-NEXT:    [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
-; VF4IC4-NEXT:    br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; VF4IC4-NEXT:    [[TMP25:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP24]])
+; VF4IC4-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1008
+; VF4IC4-NEXT:    [[TMP27:%.*]] = or i1 [[TMP25]], [[TMP26]]
+; VF4IC4-NEXT:    br i1 [[TMP27]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; VF4IC4:       middle.split:
-; VF4IC4-NEXT:    br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; VF4IC4-NEXT:    br i1 [[TMP25]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
 ; VF4IC4:       middle.block:
 ; VF4IC4-NEXT:    br i1 false, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; VF4IC4:       vector.early.exit:
-; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE1:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP21]], i1 true)
+; VF4IC4-NEXT:    [[TMP28:%.*]] = add i64 12, [[FIRST_ACTIVE_LANE1]]
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP20]], i1 true)
+; VF4IC4-NEXT:    [[TMP30:%.*]] = add i64 8, [[FIRST_ACTIVE_LANE15]]
+; VF4IC4-NEXT:    [[TMP29:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE15]], 4
+; VF4IC4-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i64 [[TMP30]], i64 [[TMP28]]
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE16:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 true)
+; VF4IC4-NEXT:    [[TMP33:%.*]] = add i64 4, [[FIRST_ACTIVE_LANE16]]
+; VF4IC4-NEXT:    [[TMP32:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE16]], 4
+; VF4IC4-NEXT:    [[TMP34:%.*]] = select i1 [[TMP32]], i64 [[TMP33]], i64 [[TMP31]]
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE17:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; VF4IC4-NEXT:    [[TMP36:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE17]]
+; VF4IC4-NEXT:    [[TMP35:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE17]], 4
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = select i1 [[TMP35]], i64 [[TMP36]], i64 [[TMP34]]
 ; VF4IC4-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i8> [[REVERSE]], i64 [[FIRST_ACTIVE_LANE]]
 ; VF4IC4-NEXT:    br label [[LOOP_END]]
 ; VF4IC4:       scalar.ph:

diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
index 16be32f..a782274 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll

@@ -206,8 +206,13 @@
 ; VF8UF2:       [[MIDDLE_BLOCK]]:
 ; VF8UF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; VF8UF2:       [[VECTOR_EARLY_EXIT]]:
+; VF8UF2-NEXT:    [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP6]], i1 true)
+; VF8UF2-NEXT:    [[TMP10:%.*]] = add i64 8, [[TMP8]]
 ; VF8UF2-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP3]], i1 true)
 ; VF8UF2-NEXT:    [[TMP5:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE]]
+; VF8UF2-NEXT:    [[TMP9:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE]], 8
+; VF8UF2-NEXT:    [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP5]], i64 [[TMP10]]
+; VF8UF2-NEXT:    [[TMP12:%.*]] = add i64 0, [[TMP11]]
 ; VF8UF2-NEXT:    br label %[[EXIT]]
 ; VF8UF2:       [[SCALAR_PH]]:
 ; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
@@ -223,7 +228,7 @@
 ; VF8UF2-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
 ; VF8UF2-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
 ; VF8UF2:       [[EXIT]]:
-; VF8UF2-NEXT:    [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP5]], %[[VECTOR_EARLY_EXIT]] ]
+; VF8UF2-NEXT:    [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP12]], %[[VECTOR_EARLY_EXIT]] ]
 ; VF8UF2-NEXT:    ret i64 [[RES]]
 ;
 ; VF16UF1-LABEL: define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(
commit	ec62dee70349f6d24cab641343a021b6c2fc679a	[log] [tgz]
author	Florian Hahn <flo@fhahn.com>	Fri Jun 27 08:44:57 2025 +0100
committer	GitHub <noreply@github.com>	Fri Jun 27 08:44:57 2025 +0100
tree	7e68d07094fd28e866c3c501f6093f697e8b1802
parent	045b82736702799f7f56da2f98a3fa1f2fae8d46 [diff]