[SLP]Add a check if the user itself is commutable

If a commutable instruction can be represented as part of a non-commutable
vector instruction (e.g. add 0, %v can be part of a sub node, modeled as
the operation sub %v, 0), its operands might still be reordered, and this
must be accounted for when checking for copyables in operands.

Fixes #158293
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7ca43ef..8aafe14 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5253,6 +5253,7 @@
           // Same applies even for non-commutative cmps, because we can invert
           // their predicate potentially and, thus, reorder the operands.
           bool IsCommutativeUser =
+              ::isCommutative(User) ||
               ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
           EdgeInfo EI(TE, U.getOperandNo());
           if (!IsCommutativeUser && !isa<CmpInst>(User)) {
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutable-member-in-non-commutable-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutable-member-in-non-commutable-node.ll
new file mode 100644
index 0000000..adceef1
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/commutable-member-in-non-commutable-node.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt --passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s
+
+define i64 @test(i32 %arg) {
+; CHECK-LABEL: define i64 @test(
+; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) null, i64 896), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[ARG]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) null, i64 896), align 4
+; CHECK-NEXT:    ret i64 0
+;
+bb:
+  %load = load i32, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) null, i64 900), align 4
+  %add = add i32 0, %load ; commutative add with the constant 0 as the first operand
+  store i32 %add, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) null, i64 900), align 4
+  %load1 = load i32, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) null, i64 896), align 4
+  %add2 = add i32 %load1, 0 ; same kind of add, but with the constant 0 as the second operand
+  %sub = sub i32 %add2, %arg ; non-commutative sub that consumes %add2
+  store i32 %sub, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) null, i64 896), align 4 ; i32 slot adjacent to the one stored at offset 900 above
+  ret i64 0
+}