[SLP]Pre-cache the last instruction for all entries before vectorization

Pre-cache the last instruction of every non-gather tree entry before
vectorization starts. Emitting new vector instructions adds new uses,
which may affect the analysis and unexpectedly change which instruction
is detected as the last one if it is computed during vectorization.
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 83252bd..41ad430 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -18414,8 +18414,14 @@
   // need to rebuild it.
   EntryToLastInstruction.clear();
   // All blocks must be scheduled before any instructions are inserted.
-  for (auto &BSIter : BlocksSchedules) {
+  for (auto &BSIter : BlocksSchedules)
     scheduleBlock(BSIter.second.get());
+  // Cache the last instruction of each non-gather node up front, so that side
+  // effects of vectorization (e.g. extra uses) cannot change the result.
+  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+    if (TE->isGather())
+      continue;
+    (void)getLastInstructionInBundle(TE.get());
   }
 
   if (ReductionRoot)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
new file mode 100644
index 0000000..9d48e7f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-generic-linux-gnu < %s | FileCheck %s
+
+define void @test(ptr %nExp, float %0, i1 %cmp, float %1) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[NEXP:%.*]], float [[TMP0:%.*]], i1 [[CMP:%.*]], float [[TMP1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0x7FF8000000000000, float poison, float poison>, float [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3
+; CHECK-NEXT:    br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[NEXP]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    br label %[[IF_END]]
+; CHECK:       [[IF_END]]:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi <4 x float> [ [[TMP11]], %[[IF_THEN]] ], [ [[TMP3]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <2 x float> [ [[TMP8]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <2 x float> [ zeroinitializer, %[[IF_THEN]] ], [ <float 0x7FF8000000000000, float 1.000000e+00>, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x float> [ [[TMP7]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = fmul <2 x float> [[TMP15]], [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = fmul <2 x float> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP19:%.*]] = fmul <4 x float> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[CALL25:%.*]] = load volatile ptr, ptr null, align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = fadd <2 x float> [[TMP18]], [[TMP17]]
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul <2 x float> [[TMP20]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = fadd <2 x float> [[TMP21]], zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = fmul <4 x float> [[TMP19]], zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x float> [[TMP19]], zeroinitializer
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x float> [[TMP23]], <4 x float> [[TMP24]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP26:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> <float 0.000000e+00, float 1.000000e+00, float poison, float poison>, <2 x float> [[TMP22]], i64 2)
+; CHECK-NEXT:    [[TMP27:%.*]] = fadd <4 x float> [[TMP25]], [[TMP26]]
+; CHECK-NEXT:    store <4 x float> [[TMP27]], ptr [[CALL25]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %div.i41 = fmul float %0, 0.000000e+00
+  %2 = load float, ptr %nExp, align 4
+  %div.1.i.i = fmul float %2, 0.000000e+00
+  %div.2.i.i = fmul float %0, 0.000000e+00
+  br label %if.end
+
+if.end:
+  %3 = phi float [ %1, %if.then ], [ %0, %entry ]
+  %4 = phi float [ 0.000000e+00, %if.then ], [ %1, %entry ]
+  %5 = phi float [ 0.000000e+00, %if.then ], [ 0x7FF8000000000000, %entry ]
+  %6 = phi float [ 0.000000e+00, %if.then ], [ 1.000000e+00, %entry ]
+  %fa.sroa.9.0 = phi float [ %div.2.i.i, %if.then ], [ 0.000000e+00, %entry ]
+  %fa.sroa.7.0 = phi float [ %div.1.i.i, %if.then ], [ 0.000000e+00, %entry ]
+  %fa.sroa.0.0 = phi float [ %div.i41, %if.then ], [ 0.000000e+00, %entry ]
+  %mul.1.i.i58 = fmul float %fa.sroa.7.0, %6
+  %mul.2.i.i60 = fmul float %fa.sroa.9.0, %6
+  %mul.1.i.i.i63 = fmul float %fa.sroa.0.0, %5
+  %mul.2.i.i.i65 = fmul float %fa.sroa.0.0, 0.000000e+00
+  %mul.i66 = fmul float %fa.sroa.0.0, 0.000000e+00
+  %add.1.i.i = fadd float %mul.1.i.i58, %mul.1.i.i.i63
+  %add.2.i.i = fadd float %mul.2.i.i60, %mul.2.i.i.i65
+  %mul.1.i.i74 = fmul float %add.1.i.i, 0.000000e+00
+  %mul.2.i.i76 = fmul float %add.2.i.i, 0.000000e+00
+  %mul.i.i.i78 = fmul float %mul.i66, 0.000000e+00
+  %add.1.i.i85 = fadd float %mul.1.i.i74, 0.000000e+00
+  %add.2.i.i86 = fadd float %mul.2.i.i76, 0.000000e+00
+  %mul.i.i.i97 = fmul float %5, 0.000000e+00
+  %mul.1.i.i.i99 = fmul float %4, 0.000000e+00
+  %mul.2.i.i.i101 = fmul float %3, 0.000000e+00
+  %add.i.i103 = fadd float %mul.i.i.i97, 0.000000e+00
+  %add.1.i.i104 = fadd float %mul.1.i.i.i99, 0.000000e+00
+  %add.2.i.i105 = fadd float %mul.2.i.i.i101, 0.000000e+00
+  %add = fadd float %mul.i.i.i78, 0.000000e+00
+  %add.i = fadd float %add.i.i103, 1.000000e+00
+  %add.1.i = fadd float %add.1.i.i104, %add.1.i.i85
+  %add.2.i = fadd float %add.2.i.i105, %add.2.i.i86
+  %call25 = load volatile ptr, ptr null, align 8
+  store float %add, ptr %call25, align 4
+  %__trans_tmp_29.sroa.5.0.call25.sroa_idx = getelementptr i8, ptr %call25, i64 4
+  store float %add.i, ptr %__trans_tmp_29.sroa.5.0.call25.sroa_idx, align 4
+  %__trans_tmp_29.sroa.6.0.call25.sroa_idx = getelementptr i8, ptr %call25, i64 8
+  store float %add.1.i, ptr %__trans_tmp_29.sroa.6.0.call25.sroa_idx, align 4
+  %__trans_tmp_29.sroa.7.0.call25.sroa_idx = getelementptr i8, ptr %call25, i64 12
+  store float %add.2.i, ptr %__trans_tmp_29.sroa.7.0.call25.sroa_idx, align 4
+  ret void
+}