[PhaseOrdering][X86] Ensure middleend has equivalent addsub pattern test coverage to backend (#164163)
Small step towards #144489
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll b/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll
index a3af048..2c1d73e 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX
@@ -12,6 +12,400 @@
; That may require some coordination between VectorCombine, SLP, and other passes.
; The end goal is to get a single "vaddsubps" instruction for x86 with AVX.
+define <2 x double> @test_addsub_v2f64(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: @test_addsub_v2f64(
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[A]], [[B]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: ret <2 x double> [[TMP3]]
+;
+ %1 = extractelement <2 x double> %A, i32 0
+ %2 = extractelement <2 x double> %B, i32 0
+ %sub = fsub double %1, %2
+ %3 = extractelement <2 x double> %A, i32 1
+ %4 = extractelement <2 x double> %B, i32 1
+ %add = fadd double %3, %4
+ %vecinsert1 = insertelement <2 x double> poison, double %sub, i32 0
+ %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add, i32 1
+ ret <2 x double> %vecinsert2
+}
+
+define <4 x double> @test_addsub_v4f64(<4 x double> %A, <4 x double> %B) {
+; CHECK-LABEL: @test_addsub_v4f64(
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x double> [[A]], [[B]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: ret <4 x double> [[TMP3]]
+;
+ %1 = extractelement <4 x double> %A, i32 0
+ %2 = extractelement <4 x double> %B, i32 0
+ %sub = fsub double %1, %2
+ %3 = extractelement <4 x double> %A, i32 2
+ %4 = extractelement <4 x double> %B, i32 2
+ %sub2 = fsub double %3, %4
+ %5 = extractelement <4 x double> %A, i32 1
+ %6 = extractelement <4 x double> %B, i32 1
+ %add = fadd double %5, %6
+ %7 = extractelement <4 x double> %A, i32 3
+ %8 = extractelement <4 x double> %B, i32 3
+ %add2 = fadd double %7, %8
+ %vecinsert1 = insertelement <4 x double> poison, double %add, i32 1
+ %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add2, i32 3
+ %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub, i32 0
+ %vecinsert4 = insertelement <4 x double> %vecinsert3, double %sub2, i32 2
+ ret <4 x double> %vecinsert4
+}
+
+define <8 x double> @test_addsub_v8f64(<8 x double> %A, <8 x double> %B) {
+; SSE2-LABEL: @test_addsub_v8f64(
+; SSE2-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]]
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE2-NEXT: [[TMP3:%.*]] = fadd <8 x double> [[A]], [[B]]
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; SSE2-NEXT: ret <8 x double> [[TMP5]]
+;
+; SSE4-LABEL: @test_addsub_v8f64(
+; SSE4-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]]
+; SSE4-NEXT: [[TMP2:%.*]] = fadd <8 x double> [[A]], [[B]]
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; SSE4-NEXT: ret <8 x double> [[TMP3]]
+;
+; AVX-LABEL: @test_addsub_v8f64(
+; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT: [[TMP2:%.*]] = fadd <8 x double> [[A]], [[B]]
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; AVX-NEXT: ret <8 x double> [[TMP3]]
+;
+ %1 = extractelement <8 x double> %A, i32 0
+ %2 = extractelement <8 x double> %B, i32 0
+ %sub = fsub double %1, %2
+ %3 = extractelement <8 x double> %A, i32 2
+ %4 = extractelement <8 x double> %B, i32 2
+ %sub2 = fsub double %3, %4
+ %5 = extractelement <8 x double> %A, i32 1
+ %6 = extractelement <8 x double> %B, i32 1
+ %add = fadd double %5, %6
+ %7 = extractelement <8 x double> %A, i32 3
+ %8 = extractelement <8 x double> %B, i32 3
+ %add2 = fadd double %7, %8
+ %9 = extractelement <8 x double> %A, i32 4
+ %10 = extractelement <8 x double> %B, i32 4
+ %sub3 = fsub double %9, %10
+ %11 = extractelement <8 x double> %A, i32 6
+ %12 = extractelement <8 x double> %B, i32 6
+ %sub4 = fsub double %11, %12
+ %13 = extractelement <8 x double> %A, i32 5
+ %14 = extractelement <8 x double> %B, i32 5
+ %add3 = fadd double %13, %14
+ %15 = extractelement <8 x double> %A, i32 7
+ %16 = extractelement <8 x double> %B, i32 7
+ %add4 = fadd double %15, %16
+ %vecinsert1 = insertelement <8 x double> poison, double %add, i32 1
+ %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add2, i32 3
+ %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub, i32 0
+ %vecinsert4 = insertelement <8 x double> %vecinsert3, double %sub2, i32 2
+ %vecinsert5 = insertelement <8 x double> %vecinsert4, double %add3, i32 5
+ %vecinsert6 = insertelement <8 x double> %vecinsert5, double %add4, i32 7
+ %vecinsert7 = insertelement <8 x double> %vecinsert6, double %sub3, i32 4
+ %vecinsert8 = insertelement <8 x double> %vecinsert7, double %sub4, i32 6
+ ret <8 x double> %vecinsert8
+}
+
+define <2 x float> @test_addsub_v2f32(<2 x float> %v0, <2 x float> %v1) {
+; CHECK-LABEL: @test_addsub_v2f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> [[V0:%.*]], [[V1:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: ret <2 x float> [[TMP3]]
+;
+ %v2 = extractelement <2 x float> %v0, i32 0
+ %v3 = extractelement <2 x float> %v1, i32 0
+ %v4 = extractelement <2 x float> %v0, i32 1
+ %v5 = extractelement <2 x float> %v1, i32 1
+ %sub = fsub float %v2, %v3
+ %add = fadd float %v5, %v4
+ %res0 = insertelement <2 x float> poison, float %sub, i32 0
+ %res1 = insertelement <2 x float> %res0, float %add, i32 1
+ ret <2 x float> %res1
+}
+
+define <4 x float> @test_addsub_v4f32(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @test_addsub_v4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: ret <4 x float> [[TMP3]]
+;
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 2
+ %4 = extractelement <4 x float> %B, i32 2
+ %sub2 = fsub float %3, %4
+ %5 = extractelement <4 x float> %A, i32 1
+ %6 = extractelement <4 x float> %B, i32 1
+ %add = fadd float %5, %6
+ %7 = extractelement <4 x float> %A, i32 3
+ %8 = extractelement <4 x float> %B, i32 3
+ %add2 = fadd float %7, %8
+ %vecinsert1 = insertelement <4 x float> poison, float %add, i32 1
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
+ %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
+ %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
+ ret <4 x float> %vecinsert4
+}
+
+define <8 x float> @test_v8f32(<8 x float> %A, <8 x float> %B) {
+; SSE2-LABEL: @test_v8f32(
+; SSE2-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE2-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; SSE2-NEXT: ret <8 x float> [[TMP5]]
+;
+; SSE4-LABEL: @test_v8f32(
+; SSE4-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
+; SSE4-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]]
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; SSE4-NEXT: ret <8 x float> [[TMP3]]
+;
+; AVX-LABEL: @test_v8f32(
+; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]]
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; AVX-NEXT: ret <8 x float> [[TMP3]]
+;
+ %1 = extractelement <8 x float> %A, i32 0
+ %2 = extractelement <8 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <8 x float> %A, i32 2
+ %4 = extractelement <8 x float> %B, i32 2
+ %sub2 = fsub float %3, %4
+ %5 = extractelement <8 x float> %A, i32 1
+ %6 = extractelement <8 x float> %B, i32 1
+ %add = fadd float %5, %6
+ %7 = extractelement <8 x float> %A, i32 3
+ %8 = extractelement <8 x float> %B, i32 3
+ %add2 = fadd float %7, %8
+ %9 = extractelement <8 x float> %A, i32 4
+ %10 = extractelement <8 x float> %B, i32 4
+ %sub3 = fsub float %9, %10
+ %11 = extractelement <8 x float> %A, i32 6
+ %12 = extractelement <8 x float> %B, i32 6
+ %sub4 = fsub float %11, %12
+ %13 = extractelement <8 x float> %A, i32 5
+ %14 = extractelement <8 x float> %B, i32 5
+ %add3 = fadd float %13, %14
+ %15 = extractelement <8 x float> %A, i32 7
+ %16 = extractelement <8 x float> %B, i32 7
+ %add4 = fadd float %15, %16
+ %vecinsert1 = insertelement <8 x float> poison, float %add, i32 1
+ %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add2, i32 3
+ %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub, i32 0
+ %vecinsert4 = insertelement <8 x float> %vecinsert3, float %sub2, i32 2
+ %vecinsert5 = insertelement <8 x float> %vecinsert4, float %add3, i32 5
+ %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add4, i32 7
+ %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub3, i32 4
+ %vecinsert8 = insertelement <8 x float> %vecinsert7, float %sub4, i32 6
+ ret <8 x float> %vecinsert8
+}
+
+define <16 x float> @test_addsub_v16f32(<16 x float> %A, <16 x float> %B) {
+; SSE2-LABEL: @test_addsub_v16f32(
+; SSE2-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]]
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; SSE2-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[A]], [[B]]
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> [[TMP4]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; SSE2-NEXT: ret <16 x float> [[TMP5]]
+;
+; SSE4-LABEL: @test_addsub_v16f32(
+; SSE4-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]]
+; SSE4-NEXT: [[TMP2:%.*]] = fadd <16 x float> [[A]], [[B]]
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> [[TMP2]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; SSE4-NEXT: ret <16 x float> [[TMP3]]
+;
+; AVX-LABEL: @test_addsub_v16f32(
+; AVX-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT: [[TMP2:%.*]] = fadd <16 x float> [[A]], [[B]]
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> [[TMP2]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; AVX-NEXT: ret <16 x float> [[TMP3]]
+;
+ %1 = extractelement <16 x float> %A, i32 0
+ %2 = extractelement <16 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <16 x float> %A, i32 2
+ %4 = extractelement <16 x float> %B, i32 2
+ %sub2 = fsub float %3, %4
+ %5 = extractelement <16 x float> %A, i32 1
+ %6 = extractelement <16 x float> %B, i32 1
+ %add = fadd float %5, %6
+ %7 = extractelement <16 x float> %A, i32 3
+ %8 = extractelement <16 x float> %B, i32 3
+ %add2 = fadd float %7, %8
+ %9 = extractelement <16 x float> %A, i32 4
+ %10 = extractelement <16 x float> %B, i32 4
+ %sub3 = fsub float %9, %10
+ %11 = extractelement <16 x float> %A, i32 6
+ %12 = extractelement <16 x float> %B, i32 6
+ %sub4 = fsub float %11, %12
+ %13 = extractelement <16 x float> %A, i32 5
+ %14 = extractelement <16 x float> %B, i32 5
+ %add3 = fadd float %13, %14
+ %15 = extractelement <16 x float> %A, i32 7
+ %16 = extractelement <16 x float> %B, i32 7
+ %add4 = fadd float %15, %16
+ %17 = extractelement <16 x float> %A, i32 8
+ %18 = extractelement <16 x float> %B, i32 8
+ %sub5 = fsub float %17, %18
+ %19 = extractelement <16 x float> %A, i32 10
+ %20 = extractelement <16 x float> %B, i32 10
+ %sub6 = fsub float %19, %20
+ %21 = extractelement <16 x float> %A, i32 9
+ %22 = extractelement <16 x float> %B, i32 9
+ %add5 = fadd float %21, %22
+ %23 = extractelement <16 x float> %A, i32 11
+ %24 = extractelement <16 x float> %B, i32 11
+ %add6 = fadd float %23, %24
+ %25 = extractelement <16 x float> %A, i32 12
+ %26 = extractelement <16 x float> %B, i32 12
+ %sub7 = fsub float %25, %26
+ %27 = extractelement <16 x float> %A, i32 14
+ %28 = extractelement <16 x float> %B, i32 14
+ %sub8 = fsub float %27, %28
+ %29 = extractelement <16 x float> %A, i32 13
+ %30 = extractelement <16 x float> %B, i32 13
+ %add7 = fadd float %29, %30
+ %31 = extractelement <16 x float> %A, i32 15
+ %32 = extractelement <16 x float> %B, i32 15
+ %add8 = fadd float %31, %32
+ %vecinsert1 = insertelement <16 x float> poison, float %add, i32 1
+ %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add2, i32 3
+ %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub, i32 0
+ %vecinsert4 = insertelement <16 x float> %vecinsert3, float %sub2, i32 2
+ %vecinsert5 = insertelement <16 x float> %vecinsert4, float %add3, i32 5
+ %vecinsert6 = insertelement <16 x float> %vecinsert5, float %add4, i32 7
+ %vecinsert7 = insertelement <16 x float> %vecinsert6, float %sub3, i32 4
+ %vecinsert8 = insertelement <16 x float> %vecinsert7, float %sub4, i32 6
+ %vecinsert9 = insertelement <16 x float> %vecinsert8, float %add5, i32 9
+ %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add6, i32 11
+ %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub5, i32 8
+ %vecinsert12 = insertelement <16 x float> %vecinsert11, float %sub6, i32 10
+ %vecinsert13 = insertelement <16 x float> %vecinsert12, float %add7, i32 13
+ %vecinsert14 = insertelement <16 x float> %vecinsert13, float %add8, i32 15
+ %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub7, i32 12
+ %vecinsert16 = insertelement <16 x float> %vecinsert15, float %sub8, i32 14
+ ret <16 x float> %vecinsert16
+}
+
+; Test that non-sequential / partial add-sub patterns are still folded.
+
+define <4 x float> @test_addsub_v4f32_shuffle_1302(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @test_addsub_v4f32_shuffle_1302(
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: ret <4 x float> [[TMP3]]
+;
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 2
+ %4 = extractelement <4 x float> %B, i32 2
+ %sub2 = fsub float %3, %4
+ %5 = extractelement <4 x float> %A, i32 1
+ %6 = extractelement <4 x float> %B, i32 1
+ %add = fadd float %5, %6
+ %7 = extractelement <4 x float> %A, i32 3
+ %8 = extractelement <4 x float> %B, i32 3
+ %add2 = fadd float %7, %8
+ %vecinsert1 = insertelement <4 x float> poison, float %add, i32 1
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
+ %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
+ %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
+ ret <4 x float> %vecinsert4
+}
+
+define <4 x float> @test_addsub_v4f32_partial_23(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @test_addsub_v4f32_partial_23(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[VECINSERT21:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 3>
+; CHECK-NEXT: ret <4 x float> [[VECINSERT21]]
+;
+ %1 = extractelement <4 x float> %A, i32 2
+ %2 = extractelement <4 x float> %B, i32 2
+ %sub2 = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 3
+ %4 = extractelement <4 x float> %B, i32 3
+ %add2 = fadd float %3, %4
+ %vecinsert1 = insertelement <4 x float> poison, float %sub2, i32 2
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
+ ret <4 x float> %vecinsert2
+}
+
+define <4 x float> @test_addsub_v4f32_partial_03(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @test_addsub_v4f32_partial_03(
+; CHECK-NEXT: [[FOLDEXTEXTBINOP:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[FOLDEXTEXTBINOP2:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT: [[VECINSERT2:%.*]] = shufflevector <4 x float> [[FOLDEXTEXTBINOP]], <4 x float> [[FOLDEXTEXTBINOP2]], <4 x i32> <i32 0, i32 poison, i32 poison, i32 7>
+; CHECK-NEXT: ret <4 x float> [[VECINSERT2]]
+;
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 3
+ %4 = extractelement <4 x float> %B, i32 3
+ %add = fadd float %4, %3
+ %vecinsert1 = insertelement <4 x float> poison, float %sub, i32 0
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 3
+ ret <4 x float> %vecinsert2
+}
+
+define <4 x float> @test_addsub_v4f32_partial_12(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @test_addsub_v4f32_partial_12(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[VECINSERT21:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <4 x i32> <i32 poison, i32 0, i32 3, i32 poison>
+; CHECK-NEXT: ret <4 x float> [[VECINSERT21]]
+;
+ %1 = extractelement <4 x float> %A, i32 2
+ %2 = extractelement <4 x float> %B, i32 2
+ %sub = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 1
+ %4 = extractelement <4 x float> %B, i32 1
+ %add = fadd float %3, %4
+ %vecinsert1 = insertelement <4 x float> poison, float %sub, i32 2
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 1
+ ret <4 x float> %vecinsert2
+}
+
+define <4 x float> @test_addsub_v4f32_partial_01(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @test_addsub_v4f32_partial_01(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: ret <4 x float> [[TMP6]]
+;
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub2 = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 1
+ %4 = extractelement <4 x float> %B, i32 1
+ %add2 = fadd float %3, %4
+ %vecinsert1 = insertelement <4 x float> poison, float %sub2, i32 0
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 1
+ ret <4 x float> %vecinsert2
+}
+
define <4 x float> @PR45015(<4 x float> %arg, <4 x float> %arg1) {
; CHECK-LABEL: @PR45015(
; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[ARG:%.*]], [[ARG1:%.*]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
index 40dc2aa..fa6403f 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX
@@ -12,6 +12,404 @@
; That may require some coordination between VectorCombine, SLP, and other passes.
; The end goal is to get a single "vaddsubps" instruction for x86 with AVX.
+define <2 x double> @test_addsub_v2f64(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: @test_addsub_v2f64(
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[A]], [[B]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: ret <2 x double> [[TMP3]]
+;
+ %1 = extractelement <2 x double> %A, i32 0
+ %2 = extractelement <2 x double> %B, i32 0
+ %sub = fsub double %1, %2
+ %3 = extractelement <2 x double> %A, i32 1
+ %4 = extractelement <2 x double> %B, i32 1
+ %add = fadd double %3, %4
+ %vecinsert1 = insertelement <2 x double> undef, double %sub, i32 0
+ %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add, i32 1
+ ret <2 x double> %vecinsert2
+}
+
+define <4 x double> @test_addsub_v4f64(<4 x double> %A, <4 x double> %B) {
+; CHECK-LABEL: @test_addsub_v4f64(
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x double> [[A]], [[B]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: ret <4 x double> [[TMP3]]
+;
+ %1 = extractelement <4 x double> %A, i32 0
+ %2 = extractelement <4 x double> %B, i32 0
+ %sub = fsub double %1, %2
+ %3 = extractelement <4 x double> %A, i32 2
+ %4 = extractelement <4 x double> %B, i32 2
+ %sub2 = fsub double %3, %4
+ %5 = extractelement <4 x double> %A, i32 1
+ %6 = extractelement <4 x double> %B, i32 1
+ %add = fadd double %5, %6
+ %7 = extractelement <4 x double> %A, i32 3
+ %8 = extractelement <4 x double> %B, i32 3
+ %add2 = fadd double %7, %8
+ %vecinsert1 = insertelement <4 x double> undef, double %add, i32 1
+ %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add2, i32 3
+ %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub, i32 0
+ %vecinsert4 = insertelement <4 x double> %vecinsert3, double %sub2, i32 2
+ ret <4 x double> %vecinsert4
+}
+
+define <8 x double> @test_addsub_v8f64(<8 x double> %A, <8 x double> %B) {
+; SSE2-LABEL: @test_addsub_v8f64(
+; SSE2-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]]
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE2-NEXT: [[TMP3:%.*]] = fadd <8 x double> [[A]], [[B]]
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; SSE2-NEXT: ret <8 x double> [[TMP5]]
+;
+; SSE4-LABEL: @test_addsub_v8f64(
+; SSE4-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]]
+; SSE4-NEXT: [[TMP2:%.*]] = fadd <8 x double> [[A]], [[B]]
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; SSE4-NEXT: ret <8 x double> [[TMP3]]
+;
+; AVX-LABEL: @test_addsub_v8f64(
+; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT: [[TMP2:%.*]] = fadd <8 x double> [[A]], [[B]]
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; AVX-NEXT: ret <8 x double> [[TMP3]]
+;
+ %1 = extractelement <8 x double> %A, i32 0
+ %2 = extractelement <8 x double> %B, i32 0
+ %sub = fsub double %1, %2
+ %3 = extractelement <8 x double> %A, i32 2
+ %4 = extractelement <8 x double> %B, i32 2
+ %sub2 = fsub double %3, %4
+ %5 = extractelement <8 x double> %A, i32 1
+ %6 = extractelement <8 x double> %B, i32 1
+ %add = fadd double %5, %6
+ %7 = extractelement <8 x double> %A, i32 3
+ %8 = extractelement <8 x double> %B, i32 3
+ %add2 = fadd double %7, %8
+ %9 = extractelement <8 x double> %A, i32 4
+ %10 = extractelement <8 x double> %B, i32 4
+ %sub3 = fsub double %9, %10
+ %11 = extractelement <8 x double> %A, i32 6
+ %12 = extractelement <8 x double> %B, i32 6
+ %sub4 = fsub double %11, %12
+ %13 = extractelement <8 x double> %A, i32 5
+ %14 = extractelement <8 x double> %B, i32 5
+ %add3 = fadd double %13, %14
+ %15 = extractelement <8 x double> %A, i32 7
+ %16 = extractelement <8 x double> %B, i32 7
+ %add4 = fadd double %15, %16
+ %vecinsert1 = insertelement <8 x double> undef, double %add, i32 1
+ %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add2, i32 3
+ %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub, i32 0
+ %vecinsert4 = insertelement <8 x double> %vecinsert3, double %sub2, i32 2
+ %vecinsert5 = insertelement <8 x double> %vecinsert4, double %add3, i32 5
+ %vecinsert6 = insertelement <8 x double> %vecinsert5, double %add4, i32 7
+ %vecinsert7 = insertelement <8 x double> %vecinsert6, double %sub3, i32 4
+ %vecinsert8 = insertelement <8 x double> %vecinsert7, double %sub4, i32 6
+ ret <8 x double> %vecinsert8
+}
+
+define <2 x float> @test_addsub_v2f32(<2 x float> %v0, <2 x float> %v1) {
+; CHECK-LABEL: @test_addsub_v2f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> [[V0:%.*]], [[V1:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: ret <2 x float> [[TMP3]]
+;
+ %v2 = extractelement <2 x float> %v0, i32 0
+ %v3 = extractelement <2 x float> %v1, i32 0
+ %v4 = extractelement <2 x float> %v0, i32 1
+ %v5 = extractelement <2 x float> %v1, i32 1
+ %sub = fsub float %v2, %v3
+ %add = fadd float %v5, %v4
+ %res0 = insertelement <2 x float> undef, float %sub, i32 0
+ %res1 = insertelement <2 x float> %res0, float %add, i32 1
+ ret <2 x float> %res1
+}
+
+define <4 x float> @test_addsub_v4f32(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @test_addsub_v4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: ret <4 x float> [[TMP3]]
+;
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 2
+ %4 = extractelement <4 x float> %B, i32 2
+ %sub2 = fsub float %3, %4
+ %5 = extractelement <4 x float> %A, i32 1
+ %6 = extractelement <4 x float> %B, i32 1
+ %add = fadd float %5, %6
+ %7 = extractelement <4 x float> %A, i32 3
+ %8 = extractelement <4 x float> %B, i32 3
+ %add2 = fadd float %7, %8
+ %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
+ %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
+ %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
+ ret <4 x float> %vecinsert4
+}
+
+define <8 x float> @test_v8f32(<8 x float> %A, <8 x float> %B) {
+; SSE2-LABEL: @test_v8f32(
+; SSE2-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE2-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; SSE2-NEXT: ret <8 x float> [[TMP5]]
+;
+; SSE4-LABEL: @test_v8f32(
+; SSE4-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
+; SSE4-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]]
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; SSE4-NEXT: ret <8 x float> [[TMP3]]
+;
+; AVX-LABEL: @test_v8f32(
+; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]]
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; AVX-NEXT: ret <8 x float> [[TMP3]]
+;
+ %1 = extractelement <8 x float> %A, i32 0
+ %2 = extractelement <8 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <8 x float> %A, i32 2
+ %4 = extractelement <8 x float> %B, i32 2
+ %sub2 = fsub float %3, %4
+ %5 = extractelement <8 x float> %A, i32 1
+ %6 = extractelement <8 x float> %B, i32 1
+ %add = fadd float %5, %6
+ %7 = extractelement <8 x float> %A, i32 3
+ %8 = extractelement <8 x float> %B, i32 3
+ %add2 = fadd float %7, %8
+ %9 = extractelement <8 x float> %A, i32 4
+ %10 = extractelement <8 x float> %B, i32 4
+ %sub3 = fsub float %9, %10
+ %11 = extractelement <8 x float> %A, i32 6
+ %12 = extractelement <8 x float> %B, i32 6
+ %sub4 = fsub float %11, %12
+ %13 = extractelement <8 x float> %A, i32 5
+ %14 = extractelement <8 x float> %B, i32 5
+ %add3 = fadd float %13, %14
+ %15 = extractelement <8 x float> %A, i32 7
+ %16 = extractelement <8 x float> %B, i32 7
+ %add4 = fadd float %15, %16
+ %vecinsert1 = insertelement <8 x float> undef, float %add, i32 1
+ %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add2, i32 3
+ %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub, i32 0
+ %vecinsert4 = insertelement <8 x float> %vecinsert3, float %sub2, i32 2
+ %vecinsert5 = insertelement <8 x float> %vecinsert4, float %add3, i32 5
+ %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add4, i32 7
+ %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub3, i32 4
+ %vecinsert8 = insertelement <8 x float> %vecinsert7, float %sub4, i32 6
+ ret <8 x float> %vecinsert8
+}
+
+define <16 x float> @test_addsub_v16f32(<16 x float> %A, <16 x float> %B) {
+; SSE2-LABEL: @test_addsub_v16f32(
+; SSE2-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]]
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; SSE2-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[A]], [[B]]
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> [[TMP4]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; SSE2-NEXT: ret <16 x float> [[TMP5]]
+;
+; SSE4-LABEL: @test_addsub_v16f32(
+; SSE4-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]]
+; SSE4-NEXT: [[TMP2:%.*]] = fadd <16 x float> [[A]], [[B]]
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> [[TMP2]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; SSE4-NEXT: ret <16 x float> [[TMP3]]
+;
+; AVX-LABEL: @test_addsub_v16f32(
+; AVX-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT: [[TMP2:%.*]] = fadd <16 x float> [[A]], [[B]]
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> [[TMP2]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; AVX-NEXT: ret <16 x float> [[TMP3]]
+;
+ %1 = extractelement <16 x float> %A, i32 0
+ %2 = extractelement <16 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <16 x float> %A, i32 2
+ %4 = extractelement <16 x float> %B, i32 2
+ %sub2 = fsub float %3, %4
+ %5 = extractelement <16 x float> %A, i32 1
+ %6 = extractelement <16 x float> %B, i32 1
+ %add = fadd float %5, %6
+ %7 = extractelement <16 x float> %A, i32 3
+ %8 = extractelement <16 x float> %B, i32 3
+ %add2 = fadd float %7, %8
+ %9 = extractelement <16 x float> %A, i32 4
+ %10 = extractelement <16 x float> %B, i32 4
+ %sub3 = fsub float %9, %10
+ %11 = extractelement <16 x float> %A, i32 6
+ %12 = extractelement <16 x float> %B, i32 6
+ %sub4 = fsub float %11, %12
+ %13 = extractelement <16 x float> %A, i32 5
+ %14 = extractelement <16 x float> %B, i32 5
+ %add3 = fadd float %13, %14
+ %15 = extractelement <16 x float> %A, i32 7
+ %16 = extractelement <16 x float> %B, i32 7
+ %add4 = fadd float %15, %16
+ %17 = extractelement <16 x float> %A, i32 8
+ %18 = extractelement <16 x float> %B, i32 8
+ %sub5 = fsub float %17, %18
+ %19 = extractelement <16 x float> %A, i32 10
+ %20 = extractelement <16 x float> %B, i32 10
+ %sub6 = fsub float %19, %20
+ %21 = extractelement <16 x float> %A, i32 9
+ %22 = extractelement <16 x float> %B, i32 9
+ %add5 = fadd float %21, %22
+ %23 = extractelement <16 x float> %A, i32 11
+ %24 = extractelement <16 x float> %B, i32 11
+ %add6 = fadd float %23, %24
+ %25 = extractelement <16 x float> %A, i32 12
+ %26 = extractelement <16 x float> %B, i32 12
+ %sub7 = fsub float %25, %26
+ %27 = extractelement <16 x float> %A, i32 14
+ %28 = extractelement <16 x float> %B, i32 14
+ %sub8 = fsub float %27, %28
+ %29 = extractelement <16 x float> %A, i32 13
+ %30 = extractelement <16 x float> %B, i32 13
+ %add7 = fadd float %29, %30
+ %31 = extractelement <16 x float> %A, i32 15
+ %32 = extractelement <16 x float> %B, i32 15
+ %add8 = fadd float %31, %32
+ %vecinsert1 = insertelement <16 x float> undef, float %add, i32 1
+ %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add2, i32 3
+ %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub, i32 0
+ %vecinsert4 = insertelement <16 x float> %vecinsert3, float %sub2, i32 2
+ %vecinsert5 = insertelement <16 x float> %vecinsert4, float %add3, i32 5
+ %vecinsert6 = insertelement <16 x float> %vecinsert5, float %add4, i32 7
+ %vecinsert7 = insertelement <16 x float> %vecinsert6, float %sub3, i32 4
+ %vecinsert8 = insertelement <16 x float> %vecinsert7, float %sub4, i32 6
+ %vecinsert9 = insertelement <16 x float> %vecinsert8, float %add5, i32 9
+ %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add6, i32 11
+ %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub5, i32 8
+ %vecinsert12 = insertelement <16 x float> %vecinsert11, float %sub6, i32 10
+ %vecinsert13 = insertelement <16 x float> %vecinsert12, float %add7, i32 13
+ %vecinsert14 = insertelement <16 x float> %vecinsert13, float %add8, i32 15
+ %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub7, i32 12
+ %vecinsert16 = insertelement <16 x float> %vecinsert15, float %sub8, i32 14
+ ret <16 x float> %vecinsert16
+}
+
+; Test that non-sequential / partial add-sub patterns are still folded.
+
+define <4 x float> @test_addsub_v4f32_shuffle_1302(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @test_addsub_v4f32_shuffle_1302(
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: ret <4 x float> [[TMP3]]
+;
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 2
+ %4 = extractelement <4 x float> %B, i32 2
+ %sub2 = fsub float %3, %4
+ %5 = extractelement <4 x float> %A, i32 1
+ %6 = extractelement <4 x float> %B, i32 1
+ %add = fadd float %5, %6
+ %7 = extractelement <4 x float> %A, i32 3
+ %8 = extractelement <4 x float> %B, i32 3
+ %add2 = fadd float %7, %8
+ %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
+ %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
+ %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
+ ret <4 x float> %vecinsert4
+}
+
+define <4 x float> @test_addsub_v4f32_partial_23(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @test_addsub_v4f32_partial_23(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINSERT21:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> <float undef, float undef, float poison, float poison>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+; CHECK-NEXT: ret <4 x float> [[VECINSERT21]]
+;
+ %1 = extractelement <4 x float> %A, i32 2
+ %2 = extractelement <4 x float> %B, i32 2
+ %sub2 = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 3
+ %4 = extractelement <4 x float> %B, i32 3
+ %add2 = fadd float %3, %4
+ %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 2
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
+ ret <4 x float> %vecinsert2
+}
+
+define <4 x float> @test_addsub_v4f32_partial_03(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @test_addsub_v4f32_partial_03(
+; CHECK-NEXT: [[FOLDEXTEXTBINOP:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[FOLDEXTEXTBINOP2:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT: [[VECINSERT1:%.*]] = shufflevector <4 x float> [[FOLDEXTEXTBINOP]], <4 x float> <float poison, float undef, float undef, float poison>, <4 x i32> <i32 0, i32 5, i32 6, i32 poison>
+; CHECK-NEXT: [[VECINSERT2:%.*]] = shufflevector <4 x float> [[VECINSERT1]], <4 x float> [[FOLDEXTEXTBINOP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT: ret <4 x float> [[VECINSERT2]]
+;
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 3
+ %4 = extractelement <4 x float> %B, i32 3
+ %add = fadd float %4, %3
+ %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 3
+ ret <4 x float> %vecinsert2
+}
+
+define <4 x float> @test_addsub_v4f32_partial_12(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @test_addsub_v4f32_partial_12(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINSERT21:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> <float undef, float poison, float poison, float undef>, <4 x i32> <i32 4, i32 0, i32 1, i32 7>
+; CHECK-NEXT: ret <4 x float> [[VECINSERT21]]
+;
+ %1 = extractelement <4 x float> %A, i32 2
+ %2 = extractelement <4 x float> %B, i32 2
+ %sub = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 1
+ %4 = extractelement <4 x float> %B, i32 1
+ %add = fadd float %3, %4
+ %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 1
+ ret <4 x float> %vecinsert2
+}
+
+define <4 x float> @test_addsub_v4f32_partial_01(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @test_addsub_v4f32_partial_01(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: ret <4 x float> [[TMP6]]
+;
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub2 = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 1
+ %4 = extractelement <4 x float> %B, i32 1
+ %add2 = fadd float %3, %4
+ %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 0
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 1
+ ret <4 x float> %vecinsert2
+}
+
define <4 x float> @PR45015(<4 x float> %arg, <4 x float> %arg1) {
; CHECK-LABEL: @PR45015(
; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[ARG:%.*]], [[ARG1:%.*]]