[SLP]Pre-cache the last instruction for all entries before vectorization

Need to pre-cache the last instruction of each entry to avoid unexpected changes in last-instruction detection during vectorization: the newly added vector instructions introduce new uses, which may affect the analysis.

commit: 4aca20c8b6dcf86696db03d860e635112601a7f9 [log] [tgz]
author: Alexey Bataev <a.bataev@outlook.com> Wed Apr 16 11:42:56 2025 -0700
committer: Alexey Bataev <a.bataev@outlook.com> Wed Apr 16 11:44:55 2025 -0700
tree: 7331577365f558f1123b657036b0cf314aacc205
parent: e77ef7b291a0024ae34eaa76dafb62aef06d3c95 [diff]
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 83252bd..41ad430 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

@@ -18414,8 +18414,14 @@
   // need to rebuild it.
   EntryToLastInstruction.clear();
   // All blocks must be scheduled before any instructions are inserted.
-  for (auto &BSIter : BlocksSchedules) {
+  for (auto &BSIter : BlocksSchedules)
     scheduleBlock(BSIter.second.get());
+  // Cache last instructions for the nodes to avoid side effects, which may
+  // appear during vectorization, like extra uses, etc.
+  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+    if (TE->isGather())
+      continue;
+    (void)getLastInstructionInBundle(TE.get());
   }
 
   if (ReductionRoot)

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
new file mode 100644
index 0000000..9d48e7f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll

@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-generic-linux-gnu < %s | FileCheck %s
+
+define void @test(ptr %nExp, float %0, i1 %cmp, float %1) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[NEXP:%.*]], float [[TMP0:%.*]], i1 [[CMP:%.*]], float [[TMP1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0x7FF8000000000000, float poison, float poison>, float [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3
+; CHECK-NEXT:    br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[NEXP]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    br label %[[IF_END]]
+; CHECK:       [[IF_END]]:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi <4 x float> [ [[TMP11]], %[[IF_THEN]] ], [ [[TMP3]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <2 x float> [ [[TMP8]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <2 x float> [ zeroinitializer, %[[IF_THEN]] ], [ <float 0x7FF8000000000000, float 1.000000e+00>, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x float> [ [[TMP7]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = fmul <2 x float> [[TMP15]], [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = fmul <2 x float> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP19:%.*]] = fmul <4 x float> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[CALL25:%.*]] = load volatile ptr, ptr null, align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = fadd <2 x float> [[TMP18]], [[TMP17]]
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul <2 x float> [[TMP20]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = fadd <2 x float> [[TMP21]], zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = fmul <4 x float> [[TMP19]], zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x float> [[TMP19]], zeroinitializer
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x float> [[TMP23]], <4 x float> [[TMP24]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP26:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> <float 0.000000e+00, float 1.000000e+00, float poison, float poison>, <2 x float> [[TMP22]], i64 2)
+; CHECK-NEXT:    [[TMP27:%.*]] = fadd <4 x float> [[TMP25]], [[TMP26]]
+; CHECK-NEXT:    store <4 x float> [[TMP27]], ptr [[CALL25]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %div.i41 = fmul float %0, 0.000000e+00
+  %2 = load float, ptr %nExp, align 4
+  %div.1.i.i = fmul float %2, 0.000000e+00
+  %div.2.i.i = fmul float %0, 0.000000e+00
+  br label %if.end
+
+if.end:
+  %3 = phi float [ %1, %if.then ], [ %0, %entry ]
+  %4 = phi float [ 0.000000e+00, %if.then ], [ %1, %entry ]
+  %5 = phi float [ 0.000000e+00, %if.then ], [ 0x7FF8000000000000, %entry ]
+  %6 = phi float [ 0.000000e+00, %if.then ], [ 1.000000e+00, %entry ]
+  %fa.sroa.9.0 = phi float [ %div.2.i.i, %if.then ], [ 0.000000e+00, %entry ]
+  %fa.sroa.7.0 = phi float [ %div.1.i.i, %if.then ], [ 0.000000e+00, %entry ]
+  %fa.sroa.0.0 = phi float [ %div.i41, %if.then ], [ 0.000000e+00, %entry ]
+  %mul.1.i.i58 = fmul float %fa.sroa.7.0, %6
+  %mul.2.i.i60 = fmul float %fa.sroa.9.0, %6
+  %mul.1.i.i.i63 = fmul float %fa.sroa.0.0, %5
+  %mul.2.i.i.i65 = fmul float %fa.sroa.0.0, 0.000000e+00
+  %mul.i66 = fmul float %fa.sroa.0.0, 0.000000e+00
+  %add.1.i.i = fadd float %mul.1.i.i58, %mul.1.i.i.i63
+  %add.2.i.i = fadd float %mul.2.i.i60, %mul.2.i.i.i65
+  %mul.1.i.i74 = fmul float %add.1.i.i, 0.000000e+00
+  %mul.2.i.i76 = fmul float %add.2.i.i, 0.000000e+00
+  %mul.i.i.i78 = fmul float %mul.i66, 0.000000e+00
+  %add.1.i.i85 = fadd float %mul.1.i.i74, 0.000000e+00
+  %add.2.i.i86 = fadd float %mul.2.i.i76, 0.000000e+00
+  %mul.i.i.i97 = fmul float %5, 0.000000e+00
+  %mul.1.i.i.i99 = fmul float %4, 0.000000e+00
+  %mul.2.i.i.i101 = fmul float %3, 0.000000e+00
+  %add.i.i103 = fadd float %mul.i.i.i97, 0.000000e+00
+  %add.1.i.i104 = fadd float %mul.1.i.i.i99, 0.000000e+00
+  %add.2.i.i105 = fadd float %mul.2.i.i.i101, 0.000000e+00
+  %add = fadd float %mul.i.i.i78, 0.000000e+00
+  %add.i = fadd float %add.i.i103, 1.000000e+00
+  %add.1.i = fadd float %add.1.i.i104, %add.1.i.i85
+  %add.2.i = fadd float %add.2.i.i105, %add.2.i.i86
+  %call25 = load volatile ptr, ptr null, align 8
+  store float %add, ptr %call25, align 4
+  %__trans_tmp_29.sroa.5.0.call25.sroa_idx = getelementptr i8, ptr %call25, i64 4
+  store float %add.i, ptr %__trans_tmp_29.sroa.5.0.call25.sroa_idx, align 4
+  %__trans_tmp_29.sroa.6.0.call25.sroa_idx = getelementptr i8, ptr %call25, i64 8
+  store float %add.1.i, ptr %__trans_tmp_29.sroa.6.0.call25.sroa_idx, align 4
+  %__trans_tmp_29.sroa.7.0.call25.sroa_idx = getelementptr i8, ptr %call25, i64 12
+  store float %add.2.i, ptr %__trans_tmp_29.sroa.7.0.call25.sroa_idx, align 4
+  ret void
+}
commit	4aca20c8b6dcf86696db03d860e635112601a7f9	[log] [tgz]
author	Alexey Bataev <a.bataev@outlook.com>	Wed Apr 16 11:42:56 2025 -0700
committer	Alexey Bataev <a.bataev@outlook.com>	Wed Apr 16 11:44:55 2025 -0700
tree	7331577365f558f1123b657036b0cf314aacc205
parent	e77ef7b291a0024ae34eaa76dafb62aef06d3c95 [diff]