[CostModel][X86] Add explicit fcmp costs for pre-SSE42 targets

Typical throughputs: cmpss/cmpsd = 1cy and cmpps/cmppd = 2cy before the Core2 era

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@351684 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 4430079..1d94eed 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1686,12 +1686,19 @@
   };
 
   static const CostTblEntry SSE2CostTbl[] = {
+    { ISD::SETCC,   MVT::v2f64,   2 },
+    { ISD::SETCC,   MVT::f64,     1 },
     { ISD::SETCC,   MVT::v2i64,   8 },
     { ISD::SETCC,   MVT::v4i32,   1 },
     { ISD::SETCC,   MVT::v8i16,   1 },
     { ISD::SETCC,   MVT::v16i8,   1 },
   };
 
+  static const CostTblEntry SSE1CostTbl[] = {
+    { ISD::SETCC,   MVT::v4f32,   2 },
+    { ISD::SETCC,   MVT::f32,     1 },
+  };
+
   if (ST->hasBWI())
     if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
       return LT.first * Entry->Cost;
@@ -1716,6 +1723,10 @@
     if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
       return LT.first * Entry->Cost;
 
+  if (ST->hasSSE1())
+    if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
 }
 
diff --git a/test/Analysis/CostModel/X86/fcmp.ll b/test/Analysis/CostModel/X86/fcmp.ll
index e900dc7..5ec8928 100644
--- a/test/Analysis/CostModel/X86/fcmp.ll
+++ b/test/Analysis/CostModel/X86/fcmp.ll
@@ -16,54 +16,54 @@
 define i32 @cmp_float_oeq(i32 %arg) {
 ; SSE2-LABEL: 'cmp_float_oeq'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp oeq float undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp oeq <2 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp oeq <4 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp oeq <8 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp oeq <16 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp oeq <2 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp oeq <4 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp oeq <8 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp oeq <16 x float> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp oeq double undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp oeq <2 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp oeq <4 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp oeq <8 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp oeq <16 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp oeq <2 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp oeq <4 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp oeq <8 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp oeq <16 x double> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'cmp_float_oeq'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp oeq float undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp oeq <2 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp oeq <4 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp oeq <8 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp oeq <16 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp oeq <2 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp oeq <4 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp oeq <8 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp oeq <16 x float> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp oeq double undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp oeq <2 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp oeq <4 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp oeq <8 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp oeq <16 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp oeq <2 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp oeq <4 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp oeq <8 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp oeq <16 x double> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'cmp_float_oeq'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp oeq float undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp oeq <2 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp oeq <4 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp oeq <8 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp oeq <16 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp oeq <2 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp oeq <4 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp oeq <8 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp oeq <16 x float> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp oeq double undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp oeq <2 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp oeq <4 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp oeq <8 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp oeq <16 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp oeq <2 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp oeq <4 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp oeq <8 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp oeq <16 x double> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'cmp_float_oeq'
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp oeq float undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp oeq <2 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp oeq <4 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp oeq <8 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp oeq <16 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp oeq <2 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp oeq <4 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp oeq <8 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp oeq <16 x float> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp oeq double undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp oeq <2 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp oeq <4 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp oeq <8 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp oeq <16 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp oeq <2 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp oeq <4 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp oeq <8 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp oeq <16 x double> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'cmp_float_oeq'
@@ -136,54 +136,54 @@
 define i32 @cmp_float_one(i32 %arg) {
 ; SSE2-LABEL: 'cmp_float_one'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp one float undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp one <2 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp one <4 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp one <8 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp one <16 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp one <2 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp one <4 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp one <8 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp one <16 x float> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp one double undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp one <2 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp one <4 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp one <8 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp one <16 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp one <2 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp one <4 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp one <8 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp one <16 x double> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'cmp_float_one'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp one float undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp one <2 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp one <4 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp one <8 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp one <16 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp one <2 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp one <4 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp one <8 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp one <16 x float> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp one double undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp one <2 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp one <4 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp one <8 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp one <16 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp one <2 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp one <4 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp one <8 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp one <16 x double> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'cmp_float_one'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp one float undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp one <2 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp one <4 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp one <8 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp one <16 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp one <2 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp one <4 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp one <8 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp one <16 x float> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp one double undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp one <2 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp one <4 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp one <8 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp one <16 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp one <2 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp one <4 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp one <8 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp one <16 x double> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'cmp_float_one'
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp one float undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp one <2 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp one <4 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp one <8 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp one <16 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp one <2 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp one <4 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp one <8 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp one <16 x float> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp one double undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp one <2 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp one <4 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp one <8 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp one <16 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp one <2 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp one <4 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp one <8 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp one <16 x double> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'cmp_float_one'
@@ -256,54 +256,54 @@
 define i32 @cmp_float_ord(i32 %arg) {
 ; SSE2-LABEL: 'cmp_float_ord'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ord float undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ord <2 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ord <4 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ord <8 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ord <16 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ord <2 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ord <4 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ord <8 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ord <16 x float> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ord double undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ord <2 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ord <4 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ord <8 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ord <16 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ord <2 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ord <4 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ord <8 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ord <16 x double> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'cmp_float_ord'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ord float undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ord <2 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ord <4 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ord <8 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ord <16 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ord <2 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ord <4 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ord <8 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ord <16 x float> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ord double undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ord <2 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ord <4 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ord <8 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ord <16 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ord <2 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ord <4 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ord <8 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ord <16 x double> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'cmp_float_ord'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ord float undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ord <2 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ord <4 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ord <8 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ord <16 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ord <2 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ord <4 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ord <8 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ord <16 x float> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ord double undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ord <2 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ord <4 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ord <8 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ord <16 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ord <2 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ord <4 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ord <8 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ord <16 x double> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'cmp_float_ord'
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ord float undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ord <2 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ord <4 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ord <8 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ord <16 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ord <2 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ord <4 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ord <8 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ord <16 x float> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ord double undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ord <2 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ord <4 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ord <8 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ord <16 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ord <2 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ord <4 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ord <8 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ord <16 x double> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'cmp_float_ord'
@@ -376,54 +376,54 @@
 define i32 @cmp_float_oge(i32 %arg) {
 ; SSE2-LABEL: 'cmp_float_oge'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp oge float undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp oge <2 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp oge <4 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp oge <8 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp oge <16 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp oge <2 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp oge <4 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp oge <8 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp oge <16 x float> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp oge double undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp oge <2 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp oge <4 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp oge <8 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp oge <16 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp oge <2 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp oge <4 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp oge <8 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp oge <16 x double> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'cmp_float_oge'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp oge float undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp oge <2 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp oge <4 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp oge <8 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp oge <16 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp oge <2 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp oge <4 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp oge <8 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp oge <16 x float> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp oge double undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp oge <2 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp oge <4 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp oge <8 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp oge <16 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp oge <2 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp oge <4 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp oge <8 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp oge <16 x double> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'cmp_float_oge'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp oge float undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp oge <2 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp oge <4 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp oge <8 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp oge <16 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp oge <2 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp oge <4 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp oge <8 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp oge <16 x float> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp oge double undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp oge <2 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp oge <4 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp oge <8 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp oge <16 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp oge <2 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp oge <4 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp oge <8 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp oge <16 x double> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'cmp_float_oge'
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp oge float undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp oge <2 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp oge <4 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp oge <8 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp oge <16 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp oge <2 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp oge <4 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp oge <8 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp oge <16 x float> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp oge double undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp oge <2 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp oge <4 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp oge <8 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp oge <16 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp oge <2 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp oge <4 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp oge <8 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp oge <16 x double> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'cmp_float_oge'
@@ -496,54 +496,54 @@
 define i32 @cmp_float_ogt(i32 %arg) {
 ; SSE2-LABEL: 'cmp_float_ogt'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ogt float undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ogt <2 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ogt <4 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ogt <8 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ogt <16 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ogt <2 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ogt <4 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ogt <8 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ogt <16 x float> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ogt double undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ogt <2 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ogt <4 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ogt <8 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ogt <16 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ogt <2 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ogt <4 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ogt <8 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ogt <16 x double> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'cmp_float_ogt'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ogt float undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ogt <2 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ogt <4 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ogt <8 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ogt <16 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ogt <2 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ogt <4 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ogt <8 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ogt <16 x float> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ogt double undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ogt <2 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ogt <4 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ogt <8 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ogt <16 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ogt <2 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ogt <4 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ogt <8 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ogt <16 x double> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'cmp_float_ogt'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ogt float undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ogt <2 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ogt <4 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ogt <8 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ogt <16 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ogt <2 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ogt <4 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ogt <8 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ogt <16 x float> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ogt double undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ogt <2 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ogt <4 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ogt <8 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ogt <16 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ogt <2 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ogt <4 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ogt <8 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ogt <16 x double> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'cmp_float_ogt'
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ogt float undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ogt <2 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ogt <4 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ogt <8 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ogt <16 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ogt <2 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ogt <4 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ogt <8 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ogt <16 x float> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ogt double undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ogt <2 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ogt <4 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ogt <8 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ogt <16 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ogt <2 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ogt <4 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ogt <8 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ogt <16 x double> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'cmp_float_ogt'
@@ -616,54 +616,54 @@
 define i32 @cmp_float_ole(i32 %arg) {
 ; SSE2-LABEL: 'cmp_float_ole'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ole float undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ole <2 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ole <4 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ole <8 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ole <16 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ole <2 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ole <4 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ole <8 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ole <16 x float> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ole double undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ole <2 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ole <4 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ole <8 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ole <16 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ole <2 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ole <4 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ole <8 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ole <16 x double> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'cmp_float_ole'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ole float undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ole <2 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ole <4 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ole <8 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ole <16 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ole <2 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ole <4 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ole <8 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ole <16 x float> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ole double undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ole <2 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ole <4 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ole <8 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ole <16 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ole <2 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ole <4 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ole <8 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ole <16 x double> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'cmp_float_ole'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ole float undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ole <2 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ole <4 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ole <8 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ole <16 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ole <2 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ole <4 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ole <8 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ole <16 x float> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ole double undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ole <2 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ole <4 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ole <8 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ole <16 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ole <2 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ole <4 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ole <8 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ole <16 x double> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'cmp_float_ole'
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ole float undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ole <2 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ole <4 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ole <8 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ole <16 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ole <2 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ole <4 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ole <8 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ole <16 x float> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ole double undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ole <2 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ole <4 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ole <8 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ole <16 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ole <2 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ole <4 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ole <8 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ole <16 x double> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'cmp_float_ole'
@@ -736,54 +736,54 @@
 define i32 @cmp_float_olt(i32 %arg) {
 ; SSE2-LABEL: 'cmp_float_olt'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp olt float undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp olt <2 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp olt <4 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp olt <8 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp olt <16 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp olt <2 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp olt <4 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp olt <8 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp olt <16 x float> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp olt double undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp olt <2 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp olt <4 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp olt <8 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp olt <16 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp olt <2 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp olt <4 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp olt <8 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp olt <16 x double> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'cmp_float_olt'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp olt float undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp olt <2 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp olt <4 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp olt <8 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp olt <16 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp olt <2 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp olt <4 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp olt <8 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp olt <16 x float> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp olt double undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp olt <2 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp olt <4 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp olt <8 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp olt <16 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp olt <2 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp olt <4 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp olt <8 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp olt <16 x double> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'cmp_float_olt'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp olt float undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp olt <2 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp olt <4 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp olt <8 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp olt <16 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp olt <2 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp olt <4 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp olt <8 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp olt <16 x float> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp olt double undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp olt <2 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp olt <4 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp olt <8 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp olt <16 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp olt <2 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp olt <4 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp olt <8 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp olt <16 x double> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'cmp_float_olt'
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp olt float undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp olt <2 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp olt <4 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp olt <8 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp olt <16 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp olt <2 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp olt <4 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp olt <8 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp olt <16 x float> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp olt double undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp olt <2 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp olt <4 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp olt <8 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp olt <16 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp olt <2 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp olt <4 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp olt <8 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp olt <16 x double> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'cmp_float_olt'
@@ -856,54 +856,54 @@
 define i32 @cmp_float_ueq(i32 %arg) {
 ; SSE2-LABEL: 'cmp_float_ueq'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ueq float undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ueq <2 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ueq <4 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ueq <8 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ueq <16 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ueq <2 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ueq <4 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ueq <8 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ueq <16 x float> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ueq double undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ueq <2 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ueq <4 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ueq <8 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ueq <16 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ueq <2 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ueq <4 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ueq <8 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ueq <16 x double> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'cmp_float_ueq'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ueq float undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ueq <2 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ueq <4 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ueq <8 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ueq <16 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ueq <2 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ueq <4 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ueq <8 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ueq <16 x float> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ueq double undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ueq <2 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ueq <4 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ueq <8 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ueq <16 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ueq <2 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ueq <4 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ueq <8 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ueq <16 x double> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'cmp_float_ueq'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ueq float undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ueq <2 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ueq <4 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ueq <8 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ueq <16 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ueq <2 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ueq <4 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ueq <8 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ueq <16 x float> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ueq double undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ueq <2 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ueq <4 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ueq <8 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ueq <16 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ueq <2 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ueq <4 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ueq <8 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ueq <16 x double> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'cmp_float_ueq'
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ueq float undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ueq <2 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ueq <4 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ueq <8 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ueq <16 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ueq <2 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ueq <4 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ueq <8 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ueq <16 x float> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ueq double undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ueq <2 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ueq <4 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ueq <8 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ueq <16 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ueq <2 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ueq <4 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ueq <8 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ueq <16 x double> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'cmp_float_ueq'
@@ -976,54 +976,54 @@
 define i32 @cmp_float_une(i32 %arg) {
 ; SSE2-LABEL: 'cmp_float_une'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp une float undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp une <2 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp une <4 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp une <8 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp une <16 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp une <2 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp une <4 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp une <8 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp une <16 x float> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp une double undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp une <2 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp une <4 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp une <8 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp une <16 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp une <2 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp une <4 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp une <8 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp une <16 x double> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'cmp_float_une'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp une float undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp une <2 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp une <4 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp une <8 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp une <16 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp une <2 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp une <4 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp une <8 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp une <16 x float> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp une double undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp une <2 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp une <4 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp une <8 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp une <16 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp une <2 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp une <4 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp une <8 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp une <16 x double> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'cmp_float_une'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp une float undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp une <2 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp une <4 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp une <8 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp une <16 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp une <2 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp une <4 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp une <8 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp une <16 x float> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp une double undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp une <2 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp une <4 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp une <8 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp une <16 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp une <2 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp une <4 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp une <8 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp une <16 x double> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'cmp_float_une'
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp une float undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp une <2 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp une <4 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp une <8 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp une <16 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp une <2 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp une <4 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp une <8 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp une <16 x float> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp une double undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp une <2 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp une <4 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp une <8 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp une <16 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp une <2 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp une <4 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp une <8 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp une <16 x double> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'cmp_float_une'
@@ -1096,54 +1096,54 @@
 define i32 @cmp_float_uno(i32 %arg) {
 ; SSE2-LABEL: 'cmp_float_uno'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp uno float undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp uno <2 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp uno <4 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp uno <8 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp uno <16 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp uno <2 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp uno <4 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp uno <8 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp uno <16 x float> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp uno double undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp uno <2 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp uno <4 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp uno <8 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp uno <16 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp uno <2 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp uno <4 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp uno <8 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp uno <16 x double> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'cmp_float_uno'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp uno float undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp uno <2 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp uno <4 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp uno <8 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp uno <16 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp uno <2 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp uno <4 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp uno <8 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp uno <16 x float> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp uno double undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp uno <2 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp uno <4 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp uno <8 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp uno <16 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp uno <2 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp uno <4 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp uno <8 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp uno <16 x double> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'cmp_float_uno'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp uno float undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp uno <2 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp uno <4 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp uno <8 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp uno <16 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp uno <2 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp uno <4 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp uno <8 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp uno <16 x float> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp uno double undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp uno <2 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp uno <4 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp uno <8 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp uno <16 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp uno <2 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp uno <4 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp uno <8 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp uno <16 x double> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'cmp_float_uno'
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp uno float undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp uno <2 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp uno <4 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp uno <8 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp uno <16 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp uno <2 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp uno <4 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp uno <8 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp uno <16 x float> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp uno double undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp uno <2 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp uno <4 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp uno <8 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp uno <16 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp uno <2 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp uno <4 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp uno <8 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp uno <16 x double> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'cmp_float_uno'
@@ -1216,54 +1216,54 @@
 define i32 @cmp_float_uge(i32 %arg) {
 ; SSE2-LABEL: 'cmp_float_uge'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp uge float undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp uge <2 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp uge <4 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp uge <8 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp uge <16 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp uge <2 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp uge <4 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp uge <8 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp uge <16 x float> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp uge double undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp uge <2 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp uge <4 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp uge <8 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp uge <16 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp uge <2 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp uge <4 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp uge <8 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp uge <16 x double> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'cmp_float_uge'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp uge float undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp uge <2 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp uge <4 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp uge <8 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp uge <16 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp uge <2 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp uge <4 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp uge <8 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp uge <16 x float> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp uge double undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp uge <2 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp uge <4 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp uge <8 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp uge <16 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp uge <2 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp uge <4 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp uge <8 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp uge <16 x double> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'cmp_float_uge'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp uge float undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp uge <2 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp uge <4 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp uge <8 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp uge <16 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp uge <2 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp uge <4 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp uge <8 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp uge <16 x float> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp uge double undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp uge <2 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp uge <4 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp uge <8 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp uge <16 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp uge <2 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp uge <4 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp uge <8 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp uge <16 x double> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'cmp_float_uge'
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp uge float undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp uge <2 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp uge <4 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp uge <8 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp uge <16 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp uge <2 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp uge <4 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp uge <8 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp uge <16 x float> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp uge double undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp uge <2 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp uge <4 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp uge <8 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp uge <16 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp uge <2 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp uge <4 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp uge <8 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp uge <16 x double> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'cmp_float_uge'
@@ -1336,54 +1336,54 @@
 define i32 @cmp_float_ugt(i32 %arg) {
 ; SSE2-LABEL: 'cmp_float_ugt'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ugt float undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ugt <2 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ugt <4 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ugt <8 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ugt <16 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ugt <2 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ugt <4 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ugt <8 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ugt <16 x float> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ugt double undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ugt <2 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ugt <4 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ugt <8 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ugt <16 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ugt <2 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ugt <4 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ugt <8 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ugt <16 x double> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'cmp_float_ugt'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ugt float undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ugt <2 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ugt <4 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ugt <8 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ugt <16 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ugt <2 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ugt <4 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ugt <8 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ugt <16 x float> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ugt double undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ugt <2 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ugt <4 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ugt <8 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ugt <16 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ugt <2 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ugt <4 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ugt <8 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ugt <16 x double> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'cmp_float_ugt'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ugt float undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ugt <2 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ugt <4 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ugt <8 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ugt <16 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ugt <2 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ugt <4 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ugt <8 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ugt <16 x float> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ugt double undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ugt <2 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ugt <4 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ugt <8 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ugt <16 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ugt <2 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ugt <4 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ugt <8 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ugt <16 x double> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'cmp_float_ugt'
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ugt float undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ugt <2 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ugt <4 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ugt <8 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ugt <16 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ugt <2 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ugt <4 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ugt <8 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ugt <16 x float> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ugt double undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ugt <2 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ugt <4 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ugt <8 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ugt <16 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ugt <2 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ugt <4 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ugt <8 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ugt <16 x double> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'cmp_float_ugt'
@@ -1456,54 +1456,54 @@
 define i32 @cmp_float_ule(i32 %arg) {
 ; SSE2-LABEL: 'cmp_float_ule'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ule float undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ule <2 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ule <4 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ule <8 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ule <16 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ule <2 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ule <4 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ule <8 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ule <16 x float> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ule double undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ule <2 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ule <4 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ule <8 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ule <16 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ule <2 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ule <4 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ule <8 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ule <16 x double> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'cmp_float_ule'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ule float undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ule <2 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ule <4 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ule <8 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ule <16 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ule <2 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ule <4 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ule <8 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ule <16 x float> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ule double undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ule <2 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ule <4 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ule <8 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ule <16 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ule <2 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ule <4 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ule <8 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ule <16 x double> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'cmp_float_ule'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ule float undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ule <2 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ule <4 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ule <8 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ule <16 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ule <2 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ule <4 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ule <8 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ule <16 x float> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ule double undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ule <2 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ule <4 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ule <8 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ule <16 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ule <2 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ule <4 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ule <8 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ule <16 x double> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'cmp_float_ule'
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ule float undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ule <2 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ule <4 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ule <8 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ule <16 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ule <2 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ule <4 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ule <8 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ule <16 x float> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ule double undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ule <2 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ule <4 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ule <8 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ule <16 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ule <2 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ule <4 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ule <8 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ule <16 x double> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'cmp_float_ule'
@@ -1576,54 +1576,54 @@
 define i32 @cmp_float_ult(i32 %arg) {
 ; SSE2-LABEL: 'cmp_float_ult'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ult float undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ult <2 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ult <4 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ult <8 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ult <16 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ult <2 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ult <4 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ult <8 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ult <16 x float> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ult double undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ult <2 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ult <4 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ult <8 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ult <16 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ult <2 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ult <4 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ult <8 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ult <16 x double> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'cmp_float_ult'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ult float undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ult <2 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ult <4 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ult <8 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ult <16 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ult <2 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ult <4 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ult <8 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ult <16 x float> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ult double undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ult <2 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ult <4 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ult <8 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ult <16 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ult <2 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ult <4 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ult <8 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ult <16 x double> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'cmp_float_ult'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ult float undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ult <2 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ult <4 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ult <8 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ult <16 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ult <2 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ult <4 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ult <8 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ult <16 x float> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ult double undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ult <2 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ult <4 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ult <8 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ult <16 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ult <2 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ult <4 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ult <8 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ult <16 x double> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'cmp_float_ult'
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp ult float undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp ult <2 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp ult <4 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp ult <8 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp ult <16 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp ult <2 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp ult <4 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp ult <8 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp ult <16 x float> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp ult double undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp ult <2 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp ult <4 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp ult <8 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp ult <16 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp ult <2 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp ult <4 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp ult <8 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp ult <16 x double> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'cmp_float_ult'
@@ -1696,54 +1696,54 @@
 define i32 @cmp_float_false(i32 %arg) {
 ; SSE2-LABEL: 'cmp_float_false'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp false float undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp false <2 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp false <4 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp false <8 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp false <16 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp false <2 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp false <4 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp false <8 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp false <16 x float> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp false double undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp false <2 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp false <4 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp false <8 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp false <16 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp false <2 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp false <4 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp false <8 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp false <16 x double> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'cmp_float_false'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp false float undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp false <2 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp false <4 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp false <8 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp false <16 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp false <2 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp false <4 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp false <8 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp false <16 x float> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp false double undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp false <2 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp false <4 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp false <8 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp false <16 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp false <2 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp false <4 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp false <8 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp false <16 x double> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'cmp_float_false'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp false float undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp false <2 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp false <4 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp false <8 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp false <16 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp false <2 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp false <4 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp false <8 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp false <16 x float> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp false double undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp false <2 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp false <4 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp false <8 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp false <16 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp false <2 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp false <4 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp false <8 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp false <16 x double> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'cmp_float_false'
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp false float undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp false <2 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp false <4 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp false <8 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp false <16 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp false <2 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp false <4 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp false <8 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp false <16 x float> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp false double undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp false <2 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp false <4 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp false <8 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp false <16 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp false <2 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp false <4 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp false <8 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp false <16 x double> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'cmp_float_false'
@@ -1816,54 +1816,54 @@
 define i32 @cmp_float_true(i32 %arg) {
 ; SSE2-LABEL: 'cmp_float_true'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp true float undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp true <2 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp true <4 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp true <8 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp true <16 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp true <2 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp true <4 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp true <8 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp true <16 x float> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp true double undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp true <2 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp true <4 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp true <8 x double> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp true <16 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp true <2 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp true <4 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp true <8 x double> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp true <16 x double> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'cmp_float_true'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp true float undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp true <2 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp true <4 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp true <8 x float> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp true <16 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp true <2 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp true <4 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp true <8 x float> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp true <16 x float> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp true double undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp true <2 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp true <4 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp true <8 x double> undef, undef
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp true <16 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp true <2 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp true <4 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp true <8 x double> undef, undef
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp true <16 x double> undef, undef
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'cmp_float_true'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp true float undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp true <2 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp true <4 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp true <8 x float> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp true <16 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp true <2 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp true <4 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp true <8 x float> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp true <16 x float> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp true double undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp true <2 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp true <4 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp true <8 x double> undef, undef
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp true <16 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp true <2 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp true <4 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp true <8 x double> undef, undef
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp true <16 x double> undef, undef
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'cmp_float_true'
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = fcmp true float undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = fcmp true <2 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = fcmp true <4 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = fcmp true <8 x float> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = fcmp true <16 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fcmp true <2 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fcmp true <4 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fcmp true <8 x float> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fcmp true <16 x float> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fcmp true double undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fcmp true <2 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = fcmp true <4 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = fcmp true <8 x double> undef, undef
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F64 = fcmp true <16 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fcmp true <2 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fcmp true <4 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fcmp true <8 x double> undef, undef
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F64 = fcmp true <16 x double> undef, undef
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'cmp_float_true'
diff --git a/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
index d49d557..8432b91 100644
--- a/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -341,114 +341,33 @@
 }
 
 define float @maxf8(float) {
-; SSE-LABEL: @maxf8(
-; SSE-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
-; SSE-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
-; SSE-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
-; SSE-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
-; SSE-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
-; SSE-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
-; SSE-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
-; SSE-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
-; SSE-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
-; SSE-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
-; SSE-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
-; SSE-NEXT:    [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
-; SSE-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
-; SSE-NEXT:    [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
-; SSE-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
-; SSE-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
-; SSE-NEXT:    [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
-; SSE-NEXT:    [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
-; SSE-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
-; SSE-NEXT:    ret float [[TMP23]]
-;
-; AVX-LABEL: @maxf8(
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16
-; AVX-NEXT:    [[TMP3:%.*]] = fcmp fast ogt float undef, undef
-; AVX-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], float undef, float undef
-; AVX-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP4]], undef
-; AVX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float undef
-; AVX-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP6]], undef
-; AVX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP6]], float undef
-; AVX-NEXT:    [[TMP9:%.*]] = fcmp fast ogt float [[TMP8]], undef
-; AVX-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], float [[TMP8]], float undef
-; AVX-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP10]], undef
-; AVX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP10]], float undef
-; AVX-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP12]], undef
-; AVX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP12]], float undef
-; AVX-NEXT:    [[TMP15:%.*]] = fcmp fast ogt float [[TMP14]], undef
-; AVX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[RDX_MINMAX_SELECT]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <8 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x float> [[RDX_MINMAX_SELECT]], <8 x float> [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <8 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> [[RDX_SHUF4]]
-; AVX-NEXT:    [[TMP16:%.*]] = extractelement <8 x float> [[RDX_MINMAX_SELECT6]], i32 0
-; AVX-NEXT:    [[TMP17:%.*]] = select i1 [[TMP15]], float [[TMP14]], float undef
-; AVX-NEXT:    ret float [[TMP16]]
-;
-; AVX2-LABEL: @maxf8(
-; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16
-; AVX2-NEXT:    [[TMP3:%.*]] = fcmp fast ogt float undef, undef
-; AVX2-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], float undef, float undef
-; AVX2-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP4]], undef
-; AVX2-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float undef
-; AVX2-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP6]], undef
-; AVX2-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP6]], float undef
-; AVX2-NEXT:    [[TMP9:%.*]] = fcmp fast ogt float [[TMP8]], undef
-; AVX2-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], float [[TMP8]], float undef
-; AVX2-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP10]], undef
-; AVX2-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP10]], float undef
-; AVX2-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP12]], undef
-; AVX2-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP12]], float undef
-; AVX2-NEXT:    [[TMP15:%.*]] = fcmp fast ogt float [[TMP14]], undef
-; AVX2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[RDX_MINMAX_SELECT]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <8 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x float> [[RDX_MINMAX_SELECT]], <8 x float> [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <8 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> [[RDX_SHUF4]]
-; AVX2-NEXT:    [[TMP16:%.*]] = extractelement <8 x float> [[RDX_MINMAX_SELECT6]], i32 0
-; AVX2-NEXT:    [[TMP17:%.*]] = select i1 [[TMP15]], float [[TMP14]], float undef
-; AVX2-NEXT:    ret float [[TMP16]]
-;
-; SKX-LABEL: @maxf8(
-; SKX-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16
-; SKX-NEXT:    [[TMP3:%.*]] = fcmp fast ogt float undef, undef
-; SKX-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], float undef, float undef
-; SKX-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP4]], undef
-; SKX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float undef
-; SKX-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP6]], undef
-; SKX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP6]], float undef
-; SKX-NEXT:    [[TMP9:%.*]] = fcmp fast ogt float [[TMP8]], undef
-; SKX-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], float [[TMP8]], float undef
-; SKX-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP10]], undef
-; SKX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP10]], float undef
-; SKX-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP12]], undef
-; SKX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP12]], float undef
-; SKX-NEXT:    [[TMP15:%.*]] = fcmp fast ogt float [[TMP14]], undef
-; SKX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[RDX_MINMAX_SELECT]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <8 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x float> [[RDX_MINMAX_SELECT]], <8 x float> [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <8 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> [[RDX_SHUF4]]
-; SKX-NEXT:    [[TMP16:%.*]] = extractelement <8 x float> [[RDX_MINMAX_SELECT6]], i32 0
-; SKX-NEXT:    [[TMP17:%.*]] = select i1 [[TMP15]], float [[TMP14]], float undef
-; SKX-NEXT:    ret float [[TMP16]]
+; CHECK-LABEL: @maxf8(
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp fast ogt float undef, undef
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], float undef, float undef
+; CHECK-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP4]], undef
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float undef
+; CHECK-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP6]], undef
+; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP6]], float undef
+; CHECK-NEXT:    [[TMP9:%.*]] = fcmp fast ogt float [[TMP8]], undef
+; CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], float [[TMP8]], float undef
+; CHECK-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP10]], undef
+; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP10]], float undef
+; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP12]], undef
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP12]], float undef
+; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast ogt float [[TMP14]], undef
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[RDX_MINMAX_SELECT]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <8 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x float> [[RDX_MINMAX_SELECT]], <8 x float> [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <8 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> [[RDX_SHUF4]]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x float> [[RDX_MINMAX_SELECT6]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP15]], float [[TMP14]], float undef
+; CHECK-NEXT:    ret float [[TMP16]]
 ;
   %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
   %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
@@ -476,195 +395,52 @@
 }
 
 define float @maxf16(float) {
-; SSE-LABEL: @maxf16(
-; SSE-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
-; SSE-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
-; SSE-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
-; SSE-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
-; SSE-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
-; SSE-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
-; SSE-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
-; SSE-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
-; SSE-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
-; SSE-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
-; SSE-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
-; SSE-NEXT:    [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
-; SSE-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
-; SSE-NEXT:    [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
-; SSE-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
-; SSE-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
-; SSE-NEXT:    [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
-; SSE-NEXT:    [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
-; SSE-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
-; SSE-NEXT:    [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
-; SSE-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
-; SSE-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
-; SSE-NEXT:    [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
-; SSE-NEXT:    [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
-; SSE-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
-; SSE-NEXT:    [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
-; SSE-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
-; SSE-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
-; SSE-NEXT:    [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
-; SSE-NEXT:    [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
-; SSE-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
-; SSE-NEXT:    [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
-; SSE-NEXT:    [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
-; SSE-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
-; SSE-NEXT:    [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
-; SSE-NEXT:    [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
-; SSE-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
-; SSE-NEXT:    [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
-; SSE-NEXT:    [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
-; SSE-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
-; SSE-NEXT:    [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
-; SSE-NEXT:    [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
-; SSE-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
-; SSE-NEXT:    ret float [[TMP47]]
-;
-; AVX-LABEL: @maxf16(
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16
-; AVX-NEXT:    [[TMP3:%.*]] = fcmp fast ogt float undef, undef
-; AVX-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], float undef, float undef
-; AVX-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP4]], undef
-; AVX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float undef
-; AVX-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP6]], undef
-; AVX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP6]], float undef
-; AVX-NEXT:    [[TMP9:%.*]] = fcmp fast ogt float [[TMP8]], undef
-; AVX-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], float [[TMP8]], float undef
-; AVX-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP10]], undef
-; AVX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP10]], float undef
-; AVX-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP12]], undef
-; AVX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP12]], float undef
-; AVX-NEXT:    [[TMP15:%.*]] = fcmp fast ogt float [[TMP14]], undef
-; AVX-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], float [[TMP14]], float undef
-; AVX-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP16]], undef
-; AVX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP16]], float undef
-; AVX-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP18]], undef
-; AVX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP18]], float undef
-; AVX-NEXT:    [[TMP21:%.*]] = fcmp fast ogt float [[TMP20]], undef
-; AVX-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], float [[TMP20]], float undef
-; AVX-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP22]], undef
-; AVX-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP22]], float undef
-; AVX-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP24]], undef
-; AVX-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP24]], float undef
-; AVX-NEXT:    [[TMP27:%.*]] = fcmp fast ogt float [[TMP26]], undef
-; AVX-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], float [[TMP26]], float undef
-; AVX-NEXT:    [[TMP29:%.*]] = fcmp fast ogt float [[TMP28]], undef
-; AVX-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP28]], float undef
-; AVX-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP30]], undef
-; AVX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x float> [[RDX_MINMAX_SELECT]], <16 x float> [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT3]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x float> [[RDX_MINMAX_SELECT3]], <16 x float> [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> [[RDX_SHUF7]]
-; AVX-NEXT:    [[TMP32:%.*]] = extractelement <16 x float> [[RDX_MINMAX_SELECT9]], i32 0
-; AVX-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], float [[TMP30]], float undef
-; AVX-NEXT:    ret float [[TMP32]]
-;
-; AVX2-LABEL: @maxf16(
-; AVX2-NEXT:    [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16
-; AVX2-NEXT:    [[TMP3:%.*]] = fcmp fast ogt float undef, undef
-; AVX2-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], float undef, float undef
-; AVX2-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP4]], undef
-; AVX2-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float undef
-; AVX2-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP6]], undef
-; AVX2-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP6]], float undef
-; AVX2-NEXT:    [[TMP9:%.*]] = fcmp fast ogt float [[TMP8]], undef
-; AVX2-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], float [[TMP8]], float undef
-; AVX2-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP10]], undef
-; AVX2-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP10]], float undef
-; AVX2-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP12]], undef
-; AVX2-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP12]], float undef
-; AVX2-NEXT:    [[TMP15:%.*]] = fcmp fast ogt float [[TMP14]], undef
-; AVX2-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], float [[TMP14]], float undef
-; AVX2-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP16]], undef
-; AVX2-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP16]], float undef
-; AVX2-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP18]], undef
-; AVX2-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP18]], float undef
-; AVX2-NEXT:    [[TMP21:%.*]] = fcmp fast ogt float [[TMP20]], undef
-; AVX2-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], float [[TMP20]], float undef
-; AVX2-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP22]], undef
-; AVX2-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP22]], float undef
-; AVX2-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP24]], undef
-; AVX2-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP24]], float undef
-; AVX2-NEXT:    [[TMP27:%.*]] = fcmp fast ogt float [[TMP26]], undef
-; AVX2-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], float [[TMP26]], float undef
-; AVX2-NEXT:    [[TMP29:%.*]] = fcmp fast ogt float [[TMP28]], undef
-; AVX2-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP28]], float undef
-; AVX2-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP30]], undef
-; AVX2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x float> [[RDX_MINMAX_SELECT]], <16 x float> [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT3]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x float> [[RDX_MINMAX_SELECT3]], <16 x float> [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> [[RDX_SHUF7]]
-; AVX2-NEXT:    [[TMP32:%.*]] = extractelement <16 x float> [[RDX_MINMAX_SELECT9]], i32 0
-; AVX2-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], float [[TMP30]], float undef
-; AVX2-NEXT:    ret float [[TMP32]]
-;
-; SKX-LABEL: @maxf16(
-; SKX-NEXT:    [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16
-; SKX-NEXT:    [[TMP3:%.*]] = fcmp fast ogt float undef, undef
-; SKX-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], float undef, float undef
-; SKX-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP4]], undef
-; SKX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float undef
-; SKX-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP6]], undef
-; SKX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP6]], float undef
-; SKX-NEXT:    [[TMP9:%.*]] = fcmp fast ogt float [[TMP8]], undef
-; SKX-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], float [[TMP8]], float undef
-; SKX-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP10]], undef
-; SKX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP10]], float undef
-; SKX-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP12]], undef
-; SKX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP12]], float undef
-; SKX-NEXT:    [[TMP15:%.*]] = fcmp fast ogt float [[TMP14]], undef
-; SKX-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], float [[TMP14]], float undef
-; SKX-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP16]], undef
-; SKX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP16]], float undef
-; SKX-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP18]], undef
-; SKX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP18]], float undef
-; SKX-NEXT:    [[TMP21:%.*]] = fcmp fast ogt float [[TMP20]], undef
-; SKX-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], float [[TMP20]], float undef
-; SKX-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP22]], undef
-; SKX-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP22]], float undef
-; SKX-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP24]], undef
-; SKX-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP24]], float undef
-; SKX-NEXT:    [[TMP27:%.*]] = fcmp fast ogt float [[TMP26]], undef
-; SKX-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], float [[TMP26]], float undef
-; SKX-NEXT:    [[TMP29:%.*]] = fcmp fast ogt float [[TMP28]], undef
-; SKX-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP28]], float undef
-; SKX-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP30]], undef
-; SKX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x float> [[RDX_MINMAX_SELECT]], <16 x float> [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT3]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x float> [[RDX_MINMAX_SELECT3]], <16 x float> [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> [[RDX_SHUF7]]
-; SKX-NEXT:    [[TMP32:%.*]] = extractelement <16 x float> [[RDX_MINMAX_SELECT9]], i32 0
-; SKX-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], float [[TMP30]], float undef
-; SKX-NEXT:    ret float [[TMP32]]
+; CHECK-LABEL: @maxf16(
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp fast ogt float undef, undef
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], float undef, float undef
+; CHECK-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP4]], undef
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float undef
+; CHECK-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP6]], undef
+; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP6]], float undef
+; CHECK-NEXT:    [[TMP9:%.*]] = fcmp fast ogt float [[TMP8]], undef
+; CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], float [[TMP8]], float undef
+; CHECK-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP10]], undef
+; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP10]], float undef
+; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP12]], undef
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP12]], float undef
+; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast ogt float [[TMP14]], undef
+; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], float [[TMP14]], float undef
+; CHECK-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP16]], undef
+; CHECK-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP16]], float undef
+; CHECK-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP18]], undef
+; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP18]], float undef
+; CHECK-NEXT:    [[TMP21:%.*]] = fcmp fast ogt float [[TMP20]], undef
+; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], float [[TMP20]], float undef
+; CHECK-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP22]], undef
+; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP22]], float undef
+; CHECK-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP24]], undef
+; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP24]], float undef
+; CHECK-NEXT:    [[TMP27:%.*]] = fcmp fast ogt float [[TMP26]], undef
+; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], float [[TMP26]], float undef
+; CHECK-NEXT:    [[TMP29:%.*]] = fcmp fast ogt float [[TMP28]], undef
+; CHECK-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP28]], float undef
+; CHECK-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP30]], undef
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x float> [[RDX_MINMAX_SELECT]], <16 x float> [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT3]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x float> [[RDX_MINMAX_SELECT3]], <16 x float> [[RDX_SHUF4]]
+; CHECK-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> [[RDX_SHUF7]]
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x float> [[RDX_MINMAX_SELECT9]], i32 0
+; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], float [[TMP30]], float undef
+; CHECK-NEXT:    ret float [[TMP32]]
 ;
   %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
   %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4