| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 |
| ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s |
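; The -mve-tail-predication pass converts calls to @llvm.get.active.lane.mask
; inside hardware loops into @llvm.arm.mve.vctp* intrinsics and rewrites the
; masked loads and stores to use the VCTP-generated predicate. The later
; tests cover patterns the pass must reject, leaving the lane mask in place.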
| |
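; Expect the <16 x i1> lane mask to be replaced by a vctp8 and the i8 memops
; to be predicated on its result.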
| define dso_local arm_aapcs_vfpcc void @mul_v16i8(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { |
| ; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mul_v16i8( |
| ; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 |
| ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 15 |
| ; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 4 |
| ; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 4 |
| ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -16 |
| ; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 4 |
| ; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 |
| ; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] |
| ; CHECK: [[VECTOR_PH]]: |
| ; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) |
| ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] |
| ; CHECK: [[VECTOR_BODY]]: |
| ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[INDEX]] |
| ; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[TMP0]]) |
| ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 16 |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 4 [[TMP]], <16 x i1> [[TMP1]], <16 x i8> undef) |
| ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[INDEX]] |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 4 [[TMP3]], <16 x i1> [[TMP1]], <16 x i8> undef) |
| ; CHECK-NEXT: [[MUL:%.*]] = mul nsw <16 x i8> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD]] |
| ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[C]], i32 [[INDEX]] |
| ; CHECK-NEXT: tail call void @llvm.masked.store.v16i8.p0(<16 x i8> [[MUL]], ptr align 4 [[TMP6]], <16 x i1> [[TMP1]]) |
| ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 |
| ; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) |
| ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 |
| ; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] |
| ; CHECK: [[FOR_COND_CLEANUP]]: |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %cmp8 = icmp eq i32 %N, 0 |
| %tmp8 = add i32 %N, 15 |
| %tmp9 = lshr i32 %tmp8, 4 |
| %tmp10 = shl nuw i32 %tmp9, 4 |
| %tmp11 = add i32 %tmp10, -16 |
| %tmp12 = lshr i32 %tmp11, 4 |
| %tmp13 = add nuw nsw i32 %tmp12, 1 |
| br i1 %cmp8, label %for.cond.cleanup, label %vector.ph |
| |
| vector.ph: ; preds = %entry |
| %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %vector.ph |
| %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] |
| %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] |
| %tmp = getelementptr inbounds i8, ptr %a, i32 %index |
| %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N) |
| %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef) |
| %tmp3 = getelementptr inbounds i8, ptr %b, i32 %index |
| %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp3, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef) |
| %mul = mul nsw <16 x i8> %wide.masked.load2, %wide.masked.load |
| %tmp6 = getelementptr inbounds i8, ptr %c, i32 %index |
| tail call void @llvm.masked.store.v16i8.p0(<16 x i8> %mul, ptr %tmp6, i32 4, <16 x i1> %active.lane.mask) |
| %index.next = add i32 %index, 16 |
| %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) |
| %tmp16 = icmp ne i32 %tmp15, 0 |
| br i1 %tmp16, label %vector.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %vector.body, %entry |
| ret void |
| } |
| |
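; As above, but for <8 x i16>: expect a vctp16 predicate.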
| define dso_local arm_aapcs_vfpcc void @mul_v8i16(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { |
| ; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mul_v8i16( |
| ; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 |
| ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 7 |
| ; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 3 |
| ; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 3 |
| ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -8 |
| ; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 3 |
| ; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 |
| ; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] |
| ; CHECK: [[VECTOR_PH]]: |
| ; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) |
| ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] |
| ; CHECK: [[VECTOR_BODY]]: |
| ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] |
| ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP0]]) |
| ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 8 |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 4 [[TMP]], <8 x i1> [[TMP1]], <8 x i16> undef) |
| ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[INDEX]] |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 4 [[TMP3]], <8 x i1> [[TMP1]], <8 x i16> undef) |
| ; CHECK-NEXT: [[MUL:%.*]] = mul nsw <8 x i16> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD]] |
| ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[C]], i32 [[INDEX]] |
| ; CHECK-NEXT: tail call void @llvm.masked.store.v8i16.p0(<8 x i16> [[MUL]], ptr align 4 [[TMP6]], <8 x i1> [[TMP1]]) |
| ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 |
| ; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) |
| ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 |
| ; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] |
| ; CHECK: [[FOR_COND_CLEANUP]]: |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %cmp8 = icmp eq i32 %N, 0 |
| %tmp8 = add i32 %N, 7 |
| %tmp9 = lshr i32 %tmp8, 3 |
| %tmp10 = shl nuw i32 %tmp9, 3 |
| %tmp11 = add i32 %tmp10, -8 |
| %tmp12 = lshr i32 %tmp11, 3 |
| %tmp13 = add nuw nsw i32 %tmp12, 1 |
| br i1 %cmp8, label %for.cond.cleanup, label %vector.ph |
| |
| vector.ph: ; preds = %entry |
| %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %vector.ph |
| %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] |
| %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] |
| %tmp = getelementptr inbounds i16, ptr %a, i32 %index |
| %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N) |
| %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef) |
| %tmp3 = getelementptr inbounds i16, ptr %b, i32 %index |
| %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp3, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef) |
| %mul = mul nsw <8 x i16> %wide.masked.load2, %wide.masked.load |
| %tmp6 = getelementptr inbounds i16, ptr %c, i32 %index |
| tail call void @llvm.masked.store.v8i16.p0(<8 x i16> %mul, ptr %tmp6, i32 4, <8 x i1> %active.lane.mask) |
| %index.next = add i32 %index, 8 |
| %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) |
| %tmp16 = icmp ne i32 %tmp15, 0 |
| br i1 %tmp16, label %vector.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %vector.body, %entry |
| ret void |
| } |
| |
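; As above, but for <4 x i32>: expect a vctp32 predicate.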
| define dso_local arm_aapcs_vfpcc void @mul_v4i32(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { |
| ; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mul_v4i32( |
| ; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 |
| ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 3 |
| ; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 2 |
| ; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 2 |
| ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -4 |
| ; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2 |
| ; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 |
| ; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] |
| ; CHECK: [[VECTOR_PH]]: |
| ; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) |
| ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] |
| ; CHECK: [[VECTOR_BODY]]: |
| ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] |
| ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) |
| ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP]], <4 x i1> [[TMP1]], <4 x i32> undef) |
| ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP3]], <4 x i1> [[TMP1]], <4 x i32> undef) |
| ; CHECK-NEXT: [[MUL:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD]] |
| ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]] |
| ; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[MUL]], ptr align 4 [[TMP6]], <4 x i1> [[TMP1]]) |
| ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 |
| ; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) |
| ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 |
| ; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] |
| ; CHECK: [[FOR_COND_CLEANUP]]: |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %cmp8 = icmp eq i32 %N, 0 |
| %tmp8 = add i32 %N, 3 |
| %tmp9 = lshr i32 %tmp8, 2 |
| %tmp10 = shl nuw i32 %tmp9, 2 |
| %tmp11 = add i32 %tmp10, -4 |
| %tmp12 = lshr i32 %tmp11, 2 |
| %tmp13 = add nuw nsw i32 %tmp12, 1 |
| br i1 %cmp8, label %for.cond.cleanup, label %vector.ph |
| |
| vector.ph: ; preds = %entry |
| %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %vector.ph |
| %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] |
| %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] |
| %tmp = getelementptr inbounds i32, ptr %a, i32 %index |
| %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) |
| %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) |
| %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index |
| %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) |
| %mul = mul nsw <4 x i32> %wide.masked.load2, %wide.masked.load |
| %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index |
| tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %mul, ptr %tmp6, i32 4, <4 x i1> %active.lane.mask) |
| %index.next = add i32 %index, 4 |
| %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) |
| %tmp16 = icmp ne i32 %tmp15, 0 |
| br i1 %tmp16, label %vector.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %vector.body, %entry |
| ret void |
| } |
| |
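; The loads are split into low/high halves and recombined before the store.
; All memops are still predicated on the lane mask, so the loop should still
; be tail-predicated with a vctp32.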
| define dso_local arm_aapcs_vfpcc void @split_vector(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { |
| ; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @split_vector( |
| ; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 |
| ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 3 |
| ; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 2 |
| ; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 2 |
| ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -4 |
| ; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2 |
| ; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 |
| ; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] |
| ; CHECK: [[VECTOR_PH]]: |
| ; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) |
| ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] |
| ; CHECK: [[VECTOR_BODY]]: |
| ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] |
| ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) |
| ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP]], <4 x i1> [[TMP1]], <4 x i32> undef) |
| ; CHECK-NEXT: [[EXTRACT_1_LOW:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> undef, <2 x i32> <i32 0, i32 2> |
| ; CHECK-NEXT: [[EXTRACT_1_HIGH:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> undef, <2 x i32> <i32 1, i32 3> |
| ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP3]], <4 x i1> [[TMP1]], <4 x i32> undef) |
| ; CHECK-NEXT: [[EXTRACT_2_LOW:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD2]], <4 x i32> undef, <2 x i32> <i32 0, i32 2> |
| ; CHECK-NEXT: [[EXTRACT_2_HIGH:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD2]], <4 x i32> undef, <2 x i32> <i32 1, i32 3> |
| ; CHECK-NEXT: [[MUL:%.*]] = mul nsw <2 x i32> [[EXTRACT_1_LOW]], [[EXTRACT_2_LOW]] |
| ; CHECK-NEXT: [[SUB:%.*]] = sub nsw <2 x i32> [[EXTRACT_1_HIGH]], [[EXTRACT_2_HIGH]] |
| ; CHECK-NEXT: [[COMBINE:%.*]] = shufflevector <2 x i32> [[MUL]], <2 x i32> [[SUB]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]] |
| ; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[COMBINE]], ptr align 4 [[TMP6]], <4 x i1> [[TMP1]]) |
| ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 |
| ; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) |
| ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 |
| ; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] |
| ; CHECK: [[FOR_COND_CLEANUP]]: |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %cmp8 = icmp eq i32 %N, 0 |
| %tmp8 = add i32 %N, 3 |
| %tmp9 = lshr i32 %tmp8, 2 |
| %tmp10 = shl nuw i32 %tmp9, 2 |
| %tmp11 = add i32 %tmp10, -4 |
| %tmp12 = lshr i32 %tmp11, 2 |
| %tmp13 = add nuw nsw i32 %tmp12, 1 |
| br i1 %cmp8, label %for.cond.cleanup, label %vector.ph |
| |
| vector.ph: ; preds = %entry |
| %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %vector.ph |
| %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] |
| %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] |
| %tmp = getelementptr inbounds i32, ptr %a, i32 %index |
| %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) |
| %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) |
  %extract.1.low = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %extract.1.high = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
| %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index |
| %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) |
  %extract.2.low = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %extract.2.high = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
| %mul = mul nsw <2 x i32> %extract.1.low, %extract.2.low |
| %sub = sub nsw <2 x i32> %extract.1.high, %extract.2.high |
| %combine = shufflevector <2 x i32> %mul, <2 x i32> %sub, <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index |
| tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %combine, ptr %tmp6, i32 4, <4 x i1> %active.lane.mask) |
| %index.next = add i32 %index, 4 |
| %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) |
| %tmp16 = icmp ne i32 %tmp15, 0 |
| br i1 %tmp16, label %vector.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %vector.body, %entry |
| ret void |
| } |
| |
; One of the loads now uses an ult predicate instead of the lane mask. The
; loop is still tail-predicated, but only the memops predicated on the lane
; mask are rewritten to use the VCTP; the mismatched load keeps its own mask.
| define dso_local arm_aapcs_vfpcc void @mismatch_load_pred(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { |
| ; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mismatch_load_pred( |
| ; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 |
| ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 3 |
| ; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 2 |
| ; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 2 |
| ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -4 |
| ; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2 |
| ; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 |
| ; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] |
| ; CHECK: [[VECTOR_PH]]: |
| ; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 |
| ; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 |
| ; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT10]], <4 x i32> undef, <4 x i32> zeroinitializer |
| ; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) |
| ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] |
| ; CHECK: [[VECTOR_BODY]]: |
| ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 |
| ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer |
| ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3> |
| ; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] |
| ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) |
| ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 |
| ; CHECK-NEXT: [[WRONG:%.*]] = icmp ult <4 x i32> [[INDUCTION]], [[BROADCAST_SPLAT11]] |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP]], <4 x i1> [[TMP1]], <4 x i32> undef) |
| ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP3]], <4 x i1> [[WRONG]], <4 x i32> undef) |
| ; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] |
| ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]] |
| ; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr align 4 [[TMP6]], <4 x i1> [[TMP1]]) |
| ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 |
| ; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) |
| ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 |
| ; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] |
| ; CHECK: [[FOR_COND_CLEANUP]]: |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %cmp8 = icmp eq i32 %N, 0 |
| %tmp8 = add i32 %N, 3 |
| %tmp9 = lshr i32 %tmp8, 2 |
| %tmp10 = shl nuw i32 %tmp9, 2 |
| %tmp11 = add i32 %tmp10, -4 |
| %tmp12 = lshr i32 %tmp11, 2 |
| %tmp13 = add nuw nsw i32 %tmp12, 1 |
| br i1 %cmp8, label %for.cond.cleanup, label %vector.ph |
| |
| vector.ph: ; preds = %entry |
| %trip.count.minus.1 = add i32 %N, -1 |
| %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 |
| %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer |
| %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %vector.ph |
| %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] |
| %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] |
| %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 |
| %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer |
| %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> |
| %tmp = getelementptr inbounds i32, ptr %a, i32 %index |
| %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) |
| %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11 |
| %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) |
| %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index |
| %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %wrong, <4 x i32> undef) |
| %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load |
| %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index |
| tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %active.lane.mask) |
| %index.next = add i32 %index, 4 |
| %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) |
| %tmp16 = icmp ne i32 %tmp15, 0 |
| br i1 %tmp16, label %vector.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %vector.body, %entry |
| ret void |
| } |
| |
; The store now uses an ult predicate instead of the lane mask; both loads are
; rewritten to use the VCTP, but the mismatched store keeps its own mask.
| define dso_local arm_aapcs_vfpcc void @mismatch_store_pred(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { |
| ; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mismatch_store_pred( |
| ; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0 |
| ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 3 |
| ; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 2 |
| ; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 2 |
| ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -4 |
| ; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2 |
| ; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1 |
| ; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]] |
| ; CHECK: [[VECTOR_PH]]: |
| ; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 |
| ; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 |
| ; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT10]], <4 x i32> undef, <4 x i32> zeroinitializer |
| ; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]]) |
| ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] |
| ; CHECK: [[VECTOR_BODY]]: |
| ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 |
| ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer |
| ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3> |
| ; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] |
| ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) |
| ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 |
| ; CHECK-NEXT: [[WRONG:%.*]] = icmp ult <4 x i32> [[INDUCTION]], [[BROADCAST_SPLAT11]] |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP]], <4 x i1> [[TMP1]], <4 x i32> undef) |
| ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP3]], <4 x i1> [[TMP1]], <4 x i32> undef) |
| ; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] |
| ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]] |
| ; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr align 4 [[TMP6]], <4 x i1> [[WRONG]]) |
| ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 |
| ; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) |
| ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 |
| ; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] |
| ; CHECK: [[FOR_COND_CLEANUP]]: |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %cmp8 = icmp eq i32 %N, 0 |
| %tmp8 = add i32 %N, 3 |
| %tmp9 = lshr i32 %tmp8, 2 |
| %tmp10 = shl nuw i32 %tmp9, 2 |
| %tmp11 = add i32 %tmp10, -4 |
| %tmp12 = lshr i32 %tmp11, 2 |
| %tmp13 = add nuw nsw i32 %tmp12, 1 |
| br i1 %cmp8, label %for.cond.cleanup, label %vector.ph |
| |
| vector.ph: ; preds = %entry |
| %trip.count.minus.1 = add i32 %N, -1 |
| %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 |
| %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer |
| %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %vector.ph |
| %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] |
| %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] |
| %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 |
| %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer |
| %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> |
| %tmp = getelementptr inbounds i32, ptr %a, i32 %index |
| %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) |
| %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11 |
| %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) |
| %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index |
| %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) |
| %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load |
| %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index |
| tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %wrong) |
| %index.next = add i32 %index, 4 |
| %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) |
| %tmp16 = icmp ne i32 %tmp15, 0 |
| br i1 %tmp16, label %vector.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %vector.body, %entry |
| ret void |
| } |
| |
; TODO: Multiple get.active.lane.mask intrinsics are not yet supported.
; This is currently rejected because, with the vector body unrolled, the
; induction step is not what the pass expects:
;
; Step value 16 doesn't match vector width 4
;
| define dso_local void @interleave4(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { |
| ; CHECK-LABEL: define dso_local void @interleave4( |
| ; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias readonly captures(none) [[C:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0 |
| ; CHECK-NEXT: [[V0:%.*]] = add i32 [[N]], 15 |
| ; CHECK-NEXT: [[V1:%.*]] = lshr i32 [[V0]], 4 |
| ; CHECK-NEXT: [[V2:%.*]] = shl nuw i32 [[V1]], 4 |
| ; CHECK-NEXT: [[V3:%.*]] = add i32 [[V2]], -16 |
| ; CHECK-NEXT: [[V4:%.*]] = lshr i32 [[V3]], 4 |
| ; CHECK-NEXT: [[V5:%.*]] = add nuw nsw i32 [[V4]], 1 |
| ; CHECK-NEXT: br i1 [[CMP8]], label %[[VECTOR_PH:.*]], label %[[FOR_COND_CLEANUP:.*]] |
| ; CHECK: [[VECTOR_PH]]: |
| ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, ptr [[A]], i32 8 |
| ; CHECK-NEXT: [[SCEVGEP30:%.*]] = getelementptr i32, ptr [[C]], i32 8 |
| ; CHECK-NEXT: [[SCEVGEP37:%.*]] = getelementptr i32, ptr [[B]], i32 8 |
| ; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[V5]]) |
| ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] |
| ; CHECK: [[VECTOR_BODY]]: |
| ; CHECK-NEXT: [[LSR_IV38:%.*]] = phi ptr [ [[SCEVGEP39:%.*]], %[[VECTOR_BODY]] ], [ [[SCEVGEP37]], %[[VECTOR_PH]] ] |
| ; CHECK-NEXT: [[LSR_IV31:%.*]] = phi ptr [ [[SCEVGEP32:%.*]], %[[VECTOR_BODY]] ], [ [[SCEVGEP30]], %[[VECTOR_PH]] ] |
| ; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP25:%.*]], %[[VECTOR_BODY]] ], [ [[SCEVGEP]], %[[VECTOR_PH]] ] |
| ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[V14:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[V6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[V15:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) |
| ; CHECK-NEXT: [[V7:%.*]] = add i32 [[INDEX]], 4 |
| ; CHECK-NEXT: [[ACTIVE_LANE_MASK15:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[V7]], i32 [[N]]) |
| ; CHECK-NEXT: [[V8:%.*]] = add i32 [[V7]], 4 |
| ; CHECK-NEXT: [[ACTIVE_LANE_MASK16:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[V8]], i32 [[N]]) |
| ; CHECK-NEXT: [[V9:%.*]] = add i32 [[V8]], 4 |
| ; CHECK-NEXT: [[ACTIVE_LANE_MASK17:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[V9]], i32 [[N]]) |
| ; CHECK-NEXT: [[SCEVGEP42:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV38]], i32 -2 |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[SCEVGEP42]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) |
| ; CHECK-NEXT: [[SCEVGEP43:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV38]], i32 -1 |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[SCEVGEP43]], <4 x i1> [[ACTIVE_LANE_MASK15]], <4 x i32> undef) |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD19:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV38]], <4 x i1> [[ACTIVE_LANE_MASK16]], <4 x i32> undef) |
| ; CHECK-NEXT: [[SCEVGEP41:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV38]], i32 1 |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD20:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[SCEVGEP41]], <4 x i1> [[ACTIVE_LANE_MASK17]], <4 x i32> undef) |
| ; CHECK-NEXT: [[SCEVGEP34:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV31]], i32 -2 |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[SCEVGEP34]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) |
| ; CHECK-NEXT: [[SCEVGEP35:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV31]], i32 -1 |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD22:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[SCEVGEP35]], <4 x i1> [[ACTIVE_LANE_MASK15]], <4 x i32> undef) |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD23:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV31]], <4 x i1> [[ACTIVE_LANE_MASK16]], <4 x i32> undef) |
| ; CHECK-NEXT: [[SCEVGEP36:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV31]], i32 1 |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[SCEVGEP36]], <4 x i1> [[ACTIVE_LANE_MASK17]], <4 x i32> undef) |
| ; CHECK-NEXT: [[V10:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD21]], [[WIDE_MASKED_LOAD]] |
| ; CHECK-NEXT: [[V11:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD22]], [[WIDE_MASKED_LOAD18]] |
| ; CHECK-NEXT: [[V12:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD23]], [[WIDE_MASKED_LOAD19]] |
| ; CHECK-NEXT: [[V13:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD24]], [[WIDE_MASKED_LOAD20]] |
| ; CHECK-NEXT: [[SCEVGEP27:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV]], i32 -2 |
| ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V10]], ptr align 4 [[SCEVGEP27]], <4 x i1> [[ACTIVE_LANE_MASK]]) |
| ; CHECK-NEXT: [[SCEVGEP28:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV]], i32 -1 |
| ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V11]], ptr align 4 [[SCEVGEP28]], <4 x i1> [[ACTIVE_LANE_MASK15]]) |
| ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V12]], ptr align 4 [[LSR_IV]], <4 x i1> [[ACTIVE_LANE_MASK16]]) |
| ; CHECK-NEXT: [[SCEVGEP29:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV]], i32 1 |
| ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V13]], ptr align 4 [[SCEVGEP29]], <4 x i1> [[ACTIVE_LANE_MASK17]]) |
| ; CHECK-NEXT: [[SCEVGEP25]] = getelementptr i32, ptr [[LSR_IV]], i32 16 |
| ; CHECK-NEXT: [[SCEVGEP32]] = getelementptr i32, ptr [[LSR_IV31]], i32 16 |
| ; CHECK-NEXT: [[SCEVGEP39]] = getelementptr i32, ptr [[LSR_IV38]], i32 16 |
| ; CHECK-NEXT: [[V14]] = add i32 [[V9]], 4 |
| ; CHECK-NEXT: [[V15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[V6]], i32 1) |
| ; CHECK-NEXT: [[V16:%.*]] = icmp ne i32 [[V15]], 0 |
| ; CHECK-NEXT: br i1 [[V16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] |
| ; CHECK: [[FOR_COND_CLEANUP]]: |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %cmp8 = icmp sgt i32 %N, 0 |
| %v0 = add i32 %N, 15 |
| %v1 = lshr i32 %v0, 4 |
| %v2 = shl nuw i32 %v1, 4 |
| %v3 = add i32 %v2, -16 |
| %v4 = lshr i32 %v3, 4 |
| %v5 = add nuw nsw i32 %v4, 1 |
| br i1 %cmp8, label %vector.ph, label %for.cond.cleanup |
| |
| vector.ph: |
| %scevgep = getelementptr i32, ptr %A, i32 8 |
| %scevgep30 = getelementptr i32, ptr %C, i32 8 |
| %scevgep37 = getelementptr i32, ptr %B, i32 8 |
| %start = call i32 @llvm.start.loop.iterations.i32(i32 %v5) |
| br label %vector.body |
| |
| vector.body: |
| %lsr.iv38 = phi ptr [ %scevgep39, %vector.body ], [ %scevgep37, %vector.ph ] |
| %lsr.iv31 = phi ptr [ %scevgep32, %vector.body ], [ %scevgep30, %vector.ph ] |
| %lsr.iv = phi ptr [ %scevgep25, %vector.body ], [ %scevgep, %vector.ph ] |
| %index = phi i32 [ 0, %vector.ph ], [ %v14, %vector.body ] |
| %v6 = phi i32 [ %start, %vector.ph ], [ %v15, %vector.body ] |
| %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) |
| %v7 = add i32 %index, 4 |
| %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N) |
| %v8 = add i32 %v7, 4 |
| %active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N) |
| %v9 = add i32 %v8, 4 |
| %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N) |
| %scevgep42 = getelementptr <4 x i32>, ptr %lsr.iv38, i32 -2 |
| %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %scevgep42, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) |
| %scevgep43 = getelementptr <4 x i32>, ptr %lsr.iv38, i32 -1 |
| %wide.masked.load18 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %scevgep43, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef) |
| %wide.masked.load19 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %lsr.iv38, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef) |
| %scevgep41 = getelementptr <4 x i32>, ptr %lsr.iv38, i32 1 |
| %wide.masked.load20 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %scevgep41, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef) |
| %scevgep34 = getelementptr <4 x i32>, ptr %lsr.iv31, i32 -2 |
| %wide.masked.load21 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %scevgep34, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) |
| %scevgep35 = getelementptr <4 x i32>, ptr %lsr.iv31, i32 -1 |
| %wide.masked.load22 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %scevgep35, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef) |
| %wide.masked.load23 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %lsr.iv31, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef) |
| %scevgep36 = getelementptr <4 x i32>, ptr %lsr.iv31, i32 1 |
| %wide.masked.load24 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %scevgep36, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef) |
| %v10 = add nsw <4 x i32> %wide.masked.load21, %wide.masked.load |
| %v11 = add nsw <4 x i32> %wide.masked.load22, %wide.masked.load18 |
| %v12 = add nsw <4 x i32> %wide.masked.load23, %wide.masked.load19 |
| %v13 = add nsw <4 x i32> %wide.masked.load24, %wide.masked.load20 |
| %scevgep27 = getelementptr <4 x i32>, ptr %lsr.iv, i32 -2 |
| call void @llvm.masked.store.v4i32.p0(<4 x i32> %v10, ptr %scevgep27, i32 4, <4 x i1> %active.lane.mask) |
| %scevgep28 = getelementptr <4 x i32>, ptr %lsr.iv, i32 -1 |
| call void @llvm.masked.store.v4i32.p0(<4 x i32> %v11, ptr %scevgep28, i32 4, <4 x i1> %active.lane.mask15) |
| call void @llvm.masked.store.v4i32.p0(<4 x i32> %v12, ptr %lsr.iv, i32 4, <4 x i1> %active.lane.mask16) |
| %scevgep29 = getelementptr <4 x i32>, ptr %lsr.iv, i32 1 |
| call void @llvm.masked.store.v4i32.p0(<4 x i32> %v13, ptr %scevgep29, i32 4, <4 x i1> %active.lane.mask17) |
| %scevgep25 = getelementptr i32, ptr %lsr.iv, i32 16 |
| %scevgep32 = getelementptr i32, ptr %lsr.iv31, i32 16 |
| %scevgep39 = getelementptr i32, ptr %lsr.iv38, i32 16 |
| %v14 = add i32 %v9, 4 |
| %v15 = call i32 @llvm.loop.decrement.reg.i32(i32 %v6, i32 1) |
| %v16 = icmp ne i32 %v15, 0 |
| br i1 %v16, label %vector.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: |
| ret void |
| } |
| |
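; The element count passed to get.active.lane.mask is the constant 42, which
; does not correspond to the trip count computed from %N for the hardware
; loop, so the lane mask must not be turned into a VCTP.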
| define dso_local void @const_expected_in_set_loop(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { |
| ; CHECK-LABEL: define dso_local void @const_expected_in_set_loop( |
| ; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias readonly captures(none) [[C:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0 |
| ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], 3 |
| ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2 |
| ; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 2 |
| ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -4 |
| ; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2 |
| ; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 1 |
| ; CHECK-NEXT: br i1 [[CMP8]], label %[[VECTOR_PH:.*]], label %[[FOR_COND_CLEANUP:.*]] |
| ; CHECK: [[VECTOR_PH]]: |
| ; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP5]]) |
| ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] |
| ; CHECK: [[VECTOR_BODY]]: |
| ; CHECK-NEXT: [[LSR_IV17:%.*]] = phi ptr [ [[SCEVGEP18:%.*]], %[[VECTOR_BODY]] ], [ [[A]], %[[VECTOR_PH]] ] |
| ; CHECK-NEXT: [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], %[[VECTOR_BODY]] ], [ [[C]], %[[VECTOR_PH]] ] |
| ; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[B]], %[[VECTOR_PH]] ] |
| ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 42) |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV14]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) |
| ; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] |
| ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP7]], ptr align 4 [[LSR_IV17]], <4 x i1> [[ACTIVE_LANE_MASK]]) |
| ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 |
| ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 |
| ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4 |
| ; CHECK-NEXT: [[SCEVGEP18]] = getelementptr i32, ptr [[LSR_IV17]], i32 4 |
| ; CHECK-NEXT: [[TMP8]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP6]], i32 1) |
| ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0 |
| ; CHECK-NEXT: br i1 [[TMP9]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]] |
| ; CHECK: [[FOR_COND_CLEANUP]]: |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %cmp8 = icmp sgt i32 %N, 0 |
| %0 = add i32 %N, 3 |
| %1 = lshr i32 %0, 2 |
| %2 = shl nuw i32 %1, 2 |
| %3 = add i32 %2, -4 |
| %4 = lshr i32 %3, 2 |
| %5 = add nuw nsw i32 %4, 1 |
| br i1 %cmp8, label %vector.ph, label %for.cond.cleanup |
| |
| vector.ph: |
| %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %vector.ph |
| %lsr.iv17 = phi ptr [ %scevgep18, %vector.body ], [ %A, %vector.ph ] |
| %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %C, %vector.ph ] |
| %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %vector.ph ] |
| %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] |
| %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ] |
| %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 42) |
| %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) |
| %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv14, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) |
| %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load |
| call void @llvm.masked.store.v4i32.p0(<4 x i32> %7, ptr %lsr.iv17, i32 4, <4 x i1> %active.lane.mask) |
| %index.next = add i32 %index, 4 |
| %scevgep = getelementptr i32, ptr %lsr.iv, i32 4 |
| %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4 |
| %scevgep18 = getelementptr i32, ptr %lsr.iv17, i32 4 |
| %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1) |
| %9 = icmp ne i32 %8, 0 |
| br i1 %9, label %vector.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %vector.body, %entry |
| ret void |
| } |
| |
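; The element count operand of get.active.lane.mask is %index, which changes
; every iteration rather than being loop-invariant, so the conversion must be
; rejected and the lane mask kept.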
| define dso_local void @tripcount_arg_not_invariant(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { |
| ; CHECK-LABEL: define dso_local void @tripcount_arg_not_invariant( |
| ; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias readonly captures(none) [[C:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0 |
| ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], 3 |
| ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2 |
| ; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 2 |
| ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -4 |
| ; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2 |
| ; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 1 |
| ; CHECK-NEXT: br i1 [[CMP8]], label %[[VECTOR_PH:.*]], label %[[FOR_COND_CLEANUP:.*]] |
| ; CHECK: [[VECTOR_PH]]: |
| ; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP5]]) |
| ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] |
| ; CHECK: [[VECTOR_BODY]]: |
| ; CHECK-NEXT: [[LSR_IV17:%.*]] = phi ptr [ [[SCEVGEP18:%.*]], %[[VECTOR_BODY]] ], [ [[A]], %[[VECTOR_PH]] ] |
| ; CHECK-NEXT: [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], %[[VECTOR_BODY]] ], [ [[C]], %[[VECTOR_PH]] ] |
| ; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[B]], %[[VECTOR_PH]] ] |
| ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[INDEX]]) |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV14]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) |
| ; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] |
| ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP7]], ptr align 4 [[LSR_IV17]], <4 x i1> [[ACTIVE_LANE_MASK]]) |
| ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 |
| ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 |
| ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4 |
| ; CHECK-NEXT: [[SCEVGEP18]] = getelementptr i32, ptr [[LSR_IV17]], i32 4 |
| ; CHECK-NEXT: [[TMP8]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP6]], i32 1) |
| ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0 |
| ; CHECK-NEXT: br i1 [[TMP9]], label %[[VECTOR_BODY]], label %[[VECTOR_PH]] |
| ; CHECK: [[FOR_COND_CLEANUP]]: |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %cmp8 = icmp sgt i32 %N, 0 |
| %0 = add i32 %N, 3 |
| %1 = lshr i32 %0, 2 |
| %2 = shl nuw i32 %1, 2 |
| %3 = add i32 %2, -4 |
| %4 = lshr i32 %3, 2 |
| %5 = add nuw nsw i32 %4, 1 |
| br i1 %cmp8, label %vector.ph, label %for.cond.cleanup |
| |
| vector.ph: ; preds = %entry |
| %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %vector.ph |
| %lsr.iv17 = phi ptr [ %scevgep18, %vector.body ], [ %A, %vector.ph ] |
| %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %C, %vector.ph ] |
| %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %vector.ph ] |
| %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] |
| %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ] |
| |
| %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %index) |
| %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) |
| %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv14, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) |
| %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load |
| call void @llvm.masked.store.v4i32.p0(<4 x i32> %7, ptr %lsr.iv17, i32 4, <4 x i1> %active.lane.mask) |
| %index.next = add i32 %index, 4 |
| %scevgep = getelementptr i32, ptr %lsr.iv, i32 4 |
| %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4 |
| %scevgep18 = getelementptr i32, ptr %lsr.iv17, i32 4 |
| %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1) |
| %9 = icmp ne i32 %8, 0 |
  ; br i1 %9, label %vector.body, label %for.cond.cleanup
  br i1 %9, label %vector.body, label %vector.ph
| |
| for.cond.cleanup: ; preds = %vector.body, %entry |
| ret void |
| } |
| |
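; The index induction variable starts at 1, not 0 (its AddRec base is not
; zero), so the lane mask cannot be proven to describe the remaining element
; count and the conversion must be rejected.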
| define dso_local void @addrec_base_not_zero(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { |
| ; CHECK-LABEL: define dso_local void @addrec_base_not_zero( |
| ; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias readonly captures(none) [[C:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0 |
| ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], 3 |
| ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2 |
| ; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 2 |
| ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -4 |
| ; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2 |
| ; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 1 |
| ; CHECK-NEXT: br i1 [[CMP8]], label %[[VECTOR_PH:.*]], label %[[FOR_COND_CLEANUP:.*]] |
| ; CHECK: [[VECTOR_PH]]: |
| ; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP5]]) |
| ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] |
| ; CHECK: [[VECTOR_BODY]]: |
| ; CHECK-NEXT: [[LSR_IV17:%.*]] = phi ptr [ [[SCEVGEP18:%.*]], %[[VECTOR_BODY]] ], [ [[A]], %[[VECTOR_PH]] ] |
| ; CHECK-NEXT: [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], %[[VECTOR_BODY]] ], [ [[C]], %[[VECTOR_PH]] ] |
| ; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[B]], %[[VECTOR_PH]] ] |
| ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 1, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) |
| ; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV14]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) |
| ; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] |
| ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP7]], ptr align 4 [[LSR_IV17]], <4 x i1> [[ACTIVE_LANE_MASK]]) |
| ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 |
| ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 |
| ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4 |
| ; CHECK-NEXT: [[SCEVGEP18]] = getelementptr i32, ptr [[LSR_IV17]], i32 4 |
| ; CHECK-NEXT: [[TMP8]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP6]], i32 1) |
| ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0 |
| ; CHECK-NEXT: br i1 [[TMP9]], label %[[VECTOR_BODY]], label %[[VECTOR_PH]] |
| ; CHECK: [[FOR_COND_CLEANUP]]: |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %cmp8 = icmp sgt i32 %N, 0 |
| %0 = add i32 %N, 3 |
| %1 = lshr i32 %0, 2 |
| %2 = shl nuw i32 %1, 2 |
| %3 = add i32 %2, -4 |
| %4 = lshr i32 %3, 2 |
| %5 = add nuw nsw i32 %4, 1 |
| br i1 %cmp8, label %vector.ph, label %for.cond.cleanup |
| |
| vector.ph: ; preds = %entry |
| %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %vector.ph |
| %lsr.iv17 = phi ptr [ %scevgep18, %vector.body ], [ %A, %vector.ph ] |
| %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %C, %vector.ph ] |
| %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %vector.ph ] |
| |
| ; AddRec base is not 0: |
| %index = phi i32 [ 1, %vector.ph ], [ %index.next, %vector.body ] |
| |
| %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ] |
| %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) |
| %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) |
| %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv14, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) |
| %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load |
| call void @llvm.masked.store.v4i32.p0(<4 x i32> %7, ptr %lsr.iv17, i32 4, <4 x i1> %active.lane.mask) |
| %index.next = add i32 %index, 4 |
| %scevgep = getelementptr i32, ptr %lsr.iv, i32 4 |
| %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4 |
| %scevgep18 = getelementptr i32, ptr %lsr.iv17, i32 4 |
| %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1) |
| %9 = icmp ne i32 %8, 0 |
  ; br i1 %9, label %vector.body, label %for.cond.cleanup
  br i1 %9, label %vector.body, label %vector.ph
| |
| for.cond.cleanup: ; preds = %vector.body, %entry |
| ret void |
| } |
| |
| |
| declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>) |
| declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32 immarg, <16 x i1>) |
| declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>) |
| declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32 immarg, <8 x i1>) |
| declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>) |
| declare void @llvm.masked.store.v2i64.p0(<2 x i64>, ptr, i32 immarg, <2 x i1>) |
| declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32 immarg, <2 x i1>, <2 x i64>) |
| declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>) |
| declare i32 @llvm.start.loop.iterations.i32(i32) |
| declare i32 @llvm.loop.decrement.reg.i32(i32, i32) |
| declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) |
| declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) |
| declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) |