[CodeGen] Handle vector UADDO, SADDO, USUBO, SSUBO

This is part of https://bugs.llvm.org/show_bug.cgi?id=40442.

Vector type legalization is implemented for the add/sub overflow opcodes
(UADDO, SADDO, USUBO, SSUBO). UMULO and SMULO are also handled as far as
type legalization is concerned, but they do not yet support vector
expansion, so no tests are added for them.
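
For example, a vector overflow intrinsic like the one below (taken from
the new tests) is now handled by the type legalizer:

  %t = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1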

The vector result widening implementation is suboptimal: widening one
result forces the other result to the same element count, which may
produce a type that itself still needs legalization and could end up in
a widen/split legalization loop (see the TODO in WidenVecRes_OverflowOp).

Differential Revision: https://reviews.llvm.org/D57639

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@353464 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 9873b1e..aee4194 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -881,7 +881,7 @@
 
   // Calculate the overflow flag: zero extend the arithmetic result from
   // the original type.
-  SDValue Ofl = DAG.getZeroExtendInReg(Res, dl, OVT);
+  SDValue Ofl = DAG.getZeroExtendInReg(Res, dl, OVT.getScalarType());
   // Overflowed if and only if this is not equal to Res.
   Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE);
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index a2a8d16..ace46e9 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -674,6 +674,7 @@
   SDValue ScalarizeVecRes_TernaryOp(SDNode *N);
   SDValue ScalarizeVecRes_UnaryOp(SDNode *N);
   SDValue ScalarizeVecRes_StrictFPOp(SDNode *N);
+  SDValue ScalarizeVecRes_OverflowOp(SDNode *N, unsigned ResNo);
   SDValue ScalarizeVecRes_InregOp(SDNode *N);
   SDValue ScalarizeVecRes_VecInregOp(SDNode *N);
 
@@ -728,6 +729,8 @@
   void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_OverflowOp(SDNode *N, unsigned ResNo,
+                              SDValue &Lo, SDValue &Hi);
 
   void SplitVecRes_MULFIX(SDNode *N, SDValue &Lo, SDValue &Hi);
 
@@ -809,6 +812,7 @@
   SDValue WidenVecRes_Binary(SDNode *N);
   SDValue WidenVecRes_BinaryCanTrap(SDNode *N);
   SDValue WidenVecRes_StrictFP(SDNode *N);
+  SDValue WidenVecRes_OverflowOp(SDNode *N, unsigned ResNo);
   SDValue WidenVecRes_Convert(SDNode *N);
   SDValue WidenVecRes_FCOPYSIGN(SDNode *N);
   SDValue WidenVecRes_POWI(SDNode *N);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 32876bf..f5ebb56 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -171,6 +171,14 @@
   case ISD::STRICT_FTRUNC:
     R = ScalarizeVecRes_StrictFPOp(N);
     break;
+  case ISD::UADDO:
+  case ISD::SADDO:
+  case ISD::USUBO:
+  case ISD::SSUBO:
+  case ISD::UMULO:
+  case ISD::SMULO:
+    R = ScalarizeVecRes_OverflowOp(N, ResNo);
+    break;
   case ISD::SMULFIX:
   case ISD::UMULFIX:
     R = ScalarizeVecRes_MULFIX(N);
@@ -235,6 +243,43 @@
   return Result;
 }
 
+SDValue DAGTypeLegalizer::ScalarizeVecRes_OverflowOp(SDNode *N,
+                                                     unsigned ResNo) {
+  SDLoc DL(N);
+  EVT ResVT = N->getValueType(0);
+  EVT OvVT = N->getValueType(1);
+
+  SDValue ScalarLHS, ScalarRHS;
+  if (getTypeAction(ResVT) == TargetLowering::TypeScalarizeVector) {
+    ScalarLHS = GetScalarizedVector(N->getOperand(0));
+    ScalarRHS = GetScalarizedVector(N->getOperand(1));
+  } else {
+    SmallVector<SDValue, 1> ElemsLHS, ElemsRHS;
+    DAG.ExtractVectorElements(N->getOperand(0), ElemsLHS);
+    DAG.ExtractVectorElements(N->getOperand(1), ElemsRHS);
+    ScalarLHS = ElemsLHS[0];
+    ScalarRHS = ElemsRHS[0];
+  }
+
+  SDVTList ScalarVTs = DAG.getVTList(
+      ResVT.getVectorElementType(), OvVT.getVectorElementType());
+  SDNode *ScalarNode = DAG.getNode(
+      N->getOpcode(), DL, ScalarVTs, ScalarLHS, ScalarRHS).getNode();
+
+  // Replace the other vector result not being explicitly scalarized here.
+  unsigned OtherNo = 1 - ResNo;
+  EVT OtherVT = N->getValueType(OtherNo);
+  if (getTypeAction(OtherVT) == TargetLowering::TypeScalarizeVector) {
+    SetScalarizedVector(SDValue(N, OtherNo), SDValue(ScalarNode, OtherNo));
+  } else {
+    SDValue OtherVal = DAG.getNode(
+        ISD::SCALAR_TO_VECTOR, DL, OtherVT, SDValue(ScalarNode, OtherNo));
+    ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
+  }
+
+  return SDValue(ScalarNode, ResNo);
+}
+
 SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N,
                                                        unsigned ResNo) {
   SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
@@ -859,6 +904,14 @@
   case ISD::STRICT_FTRUNC:
     SplitVecRes_StrictFPOp(N, Lo, Hi);
     break;
+  case ISD::UADDO:
+  case ISD::SADDO:
+  case ISD::USUBO:
+  case ISD::SSUBO:
+  case ISD::UMULO:
+  case ISD::SMULO:
+    SplitVecRes_OverflowOp(N, ResNo, Lo, Hi);
+    break;
   case ISD::SMULFIX:
   case ISD::UMULFIX:
     SplitVecRes_MULFIX(N, Lo, Hi);
@@ -1205,6 +1258,47 @@
   ReplaceValueWith(SDValue(N, 1), Chain);
 }
 
+void DAGTypeLegalizer::SplitVecRes_OverflowOp(SDNode *N, unsigned ResNo,
+                                              SDValue &Lo, SDValue &Hi) {
+  SDLoc dl(N);
+  EVT ResVT = N->getValueType(0);
+  EVT OvVT = N->getValueType(1);
+  EVT LoResVT, HiResVT, LoOvVT, HiOvVT;
+  std::tie(LoResVT, HiResVT) = DAG.GetSplitDestVTs(ResVT);
+  std::tie(LoOvVT, HiOvVT) = DAG.GetSplitDestVTs(OvVT);
+
+  SDValue LoLHS, HiLHS, LoRHS, HiRHS;
+  if (getTypeAction(ResVT) == TargetLowering::TypeSplitVector) {
+    GetSplitVector(N->getOperand(0), LoLHS, HiLHS);
+    GetSplitVector(N->getOperand(1), LoRHS, HiRHS);
+  } else {
+    std::tie(LoLHS, HiLHS) = DAG.SplitVectorOperand(N, 0);
+    std::tie(LoRHS, HiRHS) = DAG.SplitVectorOperand(N, 1);
+  }
+
+  unsigned Opcode = N->getOpcode();
+  SDVTList LoVTs = DAG.getVTList(LoResVT, LoOvVT);
+  SDVTList HiVTs = DAG.getVTList(HiResVT, HiOvVT);
+  SDNode *LoNode = DAG.getNode(Opcode, dl, LoVTs, LoLHS, LoRHS).getNode();
+  SDNode *HiNode = DAG.getNode(Opcode, dl, HiVTs, HiLHS, HiRHS).getNode();
+
+  Lo = SDValue(LoNode, ResNo);
+  Hi = SDValue(HiNode, ResNo);
+
+  // Replace the other vector result not being explicitly split here.
+  unsigned OtherNo = 1 - ResNo;
+  EVT OtherVT = N->getValueType(OtherNo);
+  if (getTypeAction(OtherVT) == TargetLowering::TypeSplitVector) {
+    SetSplitVector(SDValue(N, OtherNo),
+                   SDValue(LoNode, OtherNo), SDValue(HiNode, OtherNo));
+  } else {
+    SDValue OtherVal = DAG.getNode(
+        ISD::CONCAT_VECTORS, dl, OtherVT,
+        SDValue(LoNode, OtherNo), SDValue(HiNode, OtherNo));
+    ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
+  }
+}
+
 void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
                                                      SDValue &Hi) {
   SDValue Vec = N->getOperand(0);
@@ -2471,6 +2565,15 @@
     Res = WidenVecRes_StrictFP(N);
     break;
 
+  case ISD::UADDO:
+  case ISD::SADDO:
+  case ISD::USUBO:
+  case ISD::SSUBO:
+  case ISD::UMULO:
+  case ISD::SMULO:
+    Res = WidenVecRes_OverflowOp(N, ResNo);
+    break;
+
   case ISD::FCOPYSIGN:
     Res = WidenVecRes_FCOPYSIGN(N);
     break;
@@ -2845,6 +2948,58 @@
   return CollectOpsToWiden(DAG, TLI, ConcatOps, ConcatEnd, VT, MaxVT, WidenVT);
 }
 
+SDValue DAGTypeLegalizer::WidenVecRes_OverflowOp(SDNode *N, unsigned ResNo) {
+  SDLoc DL(N);
+  EVT ResVT = N->getValueType(0);
+  EVT OvVT = N->getValueType(1);
+  EVT WideResVT, WideOvVT;
+  SDValue WideLHS, WideRHS;
+
+  // TODO: This might result in a widen/split loop.
+  if (ResNo == 0) {
+    WideResVT = TLI.getTypeToTransformTo(*DAG.getContext(), ResVT);
+    WideOvVT = EVT::getVectorVT(
+        *DAG.getContext(), OvVT.getVectorElementType(),
+        WideResVT.getVectorNumElements());
+
+    WideLHS = GetWidenedVector(N->getOperand(0));
+    WideRHS = GetWidenedVector(N->getOperand(1));
+  } else {
+    WideOvVT = TLI.getTypeToTransformTo(*DAG.getContext(), OvVT);
+    WideResVT = EVT::getVectorVT(
+        *DAG.getContext(), ResVT.getVectorElementType(),
+        WideOvVT.getVectorNumElements());
+
+    SDValue Zero = DAG.getConstant(
+        0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()));
+    WideLHS = DAG.getNode(
+        ISD::INSERT_SUBVECTOR, DL, WideResVT, DAG.getUNDEF(WideResVT),
+        N->getOperand(0), Zero);
+    WideRHS = DAG.getNode(
+        ISD::INSERT_SUBVECTOR, DL, WideResVT, DAG.getUNDEF(WideResVT),
+        N->getOperand(1), Zero);
+  }
+
+  SDVTList WideVTs = DAG.getVTList(WideResVT, WideOvVT);
+  SDNode *WideNode = DAG.getNode(
+      N->getOpcode(), DL, WideVTs, WideLHS, WideRHS).getNode();
+
+  // Replace the other vector result not being explicitly widened here.
+  unsigned OtherNo = 1 - ResNo;
+  EVT OtherVT = N->getValueType(OtherNo);
+  if (getTypeAction(OtherVT) == TargetLowering::TypeWidenVector) {
+    SetWidenedVector(SDValue(N, OtherNo), SDValue(WideNode, OtherNo));
+  } else {
+    SDValue Zero = DAG.getConstant(
+        0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()));
+    SDValue OtherVal = DAG.getNode(
+        ISD::EXTRACT_SUBVECTOR, DL, OtherVT, SDValue(WideNode, OtherNo), Zero);
+    ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
+  }
+
+  return SDValue(WideNode, ResNo);
+}
+
 SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
   SDValue InOp = N->getOperand(0);
   SDLoc DL(N);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c7c32b4..b205e97 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6113,7 +6113,13 @@
     SDValue Op1 = getValue(I.getArgOperand(0));
     SDValue Op2 = getValue(I.getArgOperand(1));
 
-    SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i1);
+    EVT ResultVT = Op1.getValueType();
+    EVT OverflowVT = MVT::i1;
+    if (ResultVT.isVector())
+      OverflowVT = EVT::getVectorVT(
+          *Context, OverflowVT, ResultVT.getVectorNumElements());
+
+    SDVTList VTs = DAG.getVTList(ResultVT, OverflowVT);
     setValue(&I, DAG.getNode(Op, sdl, VTs, Op1, Op2));
     return nullptr;
   }
diff --git a/test/CodeGen/AArch64/vec_uaddo.ll b/test/CodeGen/AArch64/vec_uaddo.ll
new file mode 100644
index 0000000..6c2f07f
--- /dev/null
+++ b/test/CodeGen/AArch64/vec_uaddo.ll
@@ -0,0 +1,319 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
+
+declare {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32>, <1 x i32>)
+declare {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32>, <2 x i32>)
+declare {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32>, <3 x i32>)
+declare {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32>, <6 x i32>)
+declare {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+
+declare {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+
+declare {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24>, <4 x i24>)
+declare {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)
+declare {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128>, <2 x i128>)
+
+define <1 x i32> @uaddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v1.2s, v0.2s, v1.2s
+; CHECK-NEXT:    cmhi v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    str s1, [x0]
+; CHECK-NEXT:    ret
+  %t = call {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
+  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
+  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
+  %res = sext <1 x i1> %obit to <1 x i32>
+  store <1 x i32> %val, <1 x i32>* %p2
+  ret <1 x i32> %res
+}
+
+define <2 x i32> @uaddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v1.2s, v0.2s, v1.2s
+; CHECK-NEXT:    cmhi v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    ret
+  %t = call {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
+  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i32> %val, <2 x i32>* %p2
+  ret <2 x i32> %res
+}
+
+define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v3i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v1.4s, v0.4s, v1.4s
+; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    add x8, x0, #8 // =8
+; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-NEXT:    st1 { v1.s }[2], [x8]
+; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    ret
+  %t = call {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
+  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
+  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
+  %res = sext <3 x i1> %obit to <3 x i32>
+  store <3 x i32> %val, <3 x i32>* %p2
+  ret <3 x i32> %res
+}
+
+define <4 x i32> @uaddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v1.4s, v0.4s, v1.4s
+; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-NEXT:    str q1, [x0]
+; CHECK-NEXT:    ret
+  %t = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
+  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i32> %val, <4 x i32>* %p2
+  ret <4 x i32> %res
+}
+
+define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v6i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w6
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov v0.s[1], w7
+; CHECK-NEXT:    ldr s2, [sp, #16]
+; CHECK-NEXT:    ld1 { v0.s }[2], [x8]
+; CHECK-NEXT:    add x9, sp, #8 // =8
+; CHECK-NEXT:    add x10, sp, #24 // =24
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    ld1 { v2.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v0.s }[3], [x9]
+; CHECK-NEXT:    mov v1.s[1], w1
+; CHECK-NEXT:    fmov s3, w4
+; CHECK-NEXT:    ldr x11, [sp, #32]
+; CHECK-NEXT:    mov v1.s[2], w2
+; CHECK-NEXT:    mov v3.s[1], w5
+; CHECK-NEXT:    mov v1.s[3], w3
+; CHECK-NEXT:    add v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    cmhi v3.4s, v3.4s, v2.4s
+; CHECK-NEXT:    cmhi v1.4s, v1.4s, v0.4s
+; CHECK-NEXT:    str d2, [x11, #16]
+; CHECK-NEXT:    xtn v2.4h, v3.4s
+; CHECK-NEXT:    xtn v1.4h, v1.4s
+; CHECK-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-NEXT:    mov w5, v2.s[1]
+; CHECK-NEXT:    mov w1, v1.s[1]
+; CHECK-NEXT:    mov w2, v1.s[2]
+; CHECK-NEXT:    mov w3, v1.s[3]
+; CHECK-NEXT:    fmov w4, s2
+; CHECK-NEXT:    fmov w0, s1
+; CHECK-NEXT:    str q0, [x11]
+; CHECK-NEXT:    ret
+  %t = call {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
+  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
+  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
+  %res = sext <6 x i1> %obit to <6 x i32>
+  store <6 x i32> %val, <6 x i32>* %p2
+  ret <6 x i32> %res
+}
+
+define <8 x i32> @uaddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v3.4s, v1.4s, v3.4s
+; CHECK-NEXT:    add v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    cmhi v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    cmhi v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    xtn v1.4h, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-NEXT:    stp q2, q3, [x0]
+; CHECK-NEXT:    ret
+  %t = call {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
+  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
+  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
+  %res = sext <8 x i1> %obit to <8 x i32>
+  store <8 x i32> %val, <8 x i32>* %p2
+  ret <8 x i32> %res
+}
+
+define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v4.16b, v0.16b, v1.16b
+; CHECK-NEXT:    cmhi v0.16b, v0.16b, v4.16b
+; CHECK-NEXT:    zip1 v1.8b, v0.8b, v0.8b
+; CHECK-NEXT:    zip2 v2.8b, v0.8b, v0.8b
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-NEXT:    zip1 v3.8b, v0.8b, v0.8b
+; CHECK-NEXT:    zip2 v0.8b, v0.8b, v0.8b
+; CHECK-NEXT:    shl v1.4s, v1.4s, #31
+; CHECK-NEXT:    shl v2.4s, v2.4s, #31
+; CHECK-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-NEXT:    ushll v5.4s, v0.4h, #0
+; CHECK-NEXT:    sshr v0.4s, v1.4s, #31
+; CHECK-NEXT:    sshr v1.4s, v2.4s, #31
+; CHECK-NEXT:    shl v2.4s, v3.4s, #31
+; CHECK-NEXT:    shl v3.4s, v5.4s, #31
+; CHECK-NEXT:    sshr v2.4s, v2.4s, #31
+; CHECK-NEXT:    sshr v3.4s, v3.4s, #31
+; CHECK-NEXT:    str q4, [x0]
+; CHECK-NEXT:    ret
+  %t = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
+  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
+  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
+  %res = sext <16 x i1> %obit to <16 x i32>
+  store <16 x i8> %val, <16 x i8>* %p2
+  ret <16 x i32> %res
+}
+
+define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v2.8h, v0.8h, v1.8h
+; CHECK-NEXT:    cmhi v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    zip1 v1.8b, v0.8b, v0.8b
+; CHECK-NEXT:    zip2 v0.8b, v0.8b, v0.8b
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    shl v1.4s, v1.4s, #31
+; CHECK-NEXT:    shl v3.4s, v0.4s, #31
+; CHECK-NEXT:    sshr v0.4s, v1.4s, #31
+; CHECK-NEXT:    sshr v1.4s, v3.4s, #31
+; CHECK-NEXT:    str q2, [x0]
+; CHECK-NEXT:    ret
+  %t = call {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
+  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
+  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
+  %res = sext <8 x i1> %obit to <8 x i32>
+  store <8 x i16> %val, <8 x i16>* %p2
+  ret <8 x i32> %res
+}
+
+define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v1.2d, v0.2d, v1.2d
+; CHECK-NEXT:    cmhi v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    str q1, [x0]
+; CHECK-NEXT:    ret
+  %t = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
+  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i64> %val, <2 x i64>* %p2
+  ret <2 x i32> %res
+}
+
+define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v4i24:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bic v1.4s, #255, lsl #24
+; CHECK-NEXT:    bic v0.4s, #255, lsl #24
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    mov w8, v0.s[3]
+; CHECK-NEXT:    bic v1.4s, #255, lsl #24
+; CHECK-NEXT:    mov w9, v0.s[2]
+; CHECK-NEXT:    mov w10, v0.s[1]
+; CHECK-NEXT:    sturh w8, [x0, #9]
+; CHECK-NEXT:    lsr w8, w8, #16
+; CHECK-NEXT:    cmeq v1.4s, v1.4s, v0.4s
+; CHECK-NEXT:    fmov w11, s0
+; CHECK-NEXT:    strh w9, [x0, #6]
+; CHECK-NEXT:    sturh w10, [x0, #3]
+; CHECK-NEXT:    lsr w9, w9, #16
+; CHECK-NEXT:    lsr w10, w10, #16
+; CHECK-NEXT:    strb w8, [x0, #11]
+; CHECK-NEXT:    mvn v0.16b, v1.16b
+; CHECK-NEXT:    lsr w8, w11, #16
+; CHECK-NEXT:    strh w11, [x0]
+; CHECK-NEXT:    strb w9, [x0, #8]
+; CHECK-NEXT:    strb w10, [x0, #5]
+; CHECK-NEXT:    strb w8, [x0, #2]
+; CHECK-NEXT:    ret
+  %t = call {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
+  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i24> %val, <4 x i24>* %p2
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.4h, #1
+; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    add v1.4h, v0.4h, v1.4h
+; CHECK-NEXT:    umov w9, v1.h[1]
+; CHECK-NEXT:    umov w8, v1.h[0]
+; CHECK-NEXT:    and w9, w9, #0x1
+; CHECK-NEXT:    bfi w8, w9, #1, #1
+; CHECK-NEXT:    umov w9, v1.h[2]
+; CHECK-NEXT:    and v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    and w9, w9, #0x1
+; CHECK-NEXT:    cmeq v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    bfi w8, w9, #2, #1
+; CHECK-NEXT:    umov w9, v1.h[3]
+; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    bfi w8, w9, #3, #29
+; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-NEXT:    and w8, w8, #0xf
+; CHECK-NEXT:    strb w8, [x0]
+; CHECK-NEXT:    ret
+  %t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
+  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i1> %val, <4 x i1>* %p2
+  ret <4 x i32> %res
+}
+
+define <2 x i32> @uaddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v2i128:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adds x9, x2, x6
+; CHECK-NEXT:    adcs x10, x3, x7
+; CHECK-NEXT:    cmp x9, x2
+; CHECK-NEXT:    cset w11, lo
+; CHECK-NEXT:    cmp x10, x3
+; CHECK-NEXT:    cset w12, lo
+; CHECK-NEXT:    csel w11, w11, w12, eq
+; CHECK-NEXT:    adds x12, x0, x4
+; CHECK-NEXT:    adcs x13, x1, x5
+; CHECK-NEXT:    cmp x12, x0
+; CHECK-NEXT:    cset w14, lo
+; CHECK-NEXT:    cmp x13, x1
+; CHECK-NEXT:    cset w15, lo
+; CHECK-NEXT:    csel w14, w14, w15, eq
+; CHECK-NEXT:    ldr x8, [sp]
+; CHECK-NEXT:    fmov s0, w14
+; CHECK-NEXT:    mov v0.s[1], w11
+; CHECK-NEXT:    shl v0.2s, v0.2s, #31
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #31
+; CHECK-NEXT:    stp x9, x10, [x8, #16]
+; CHECK-NEXT:    stp x12, x13, [x8]
+; CHECK-NEXT:    ret
+  %t = call {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
+  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i128> %val, <2 x i128>* %p2
+  ret <2 x i32> %res
+}
diff --git a/test/CodeGen/AMDGPU/saddo.ll b/test/CodeGen/AMDGPU/saddo.ll
index 0b52821..a5d7592 100644
--- a/test/CodeGen/AMDGPU/saddo.ll
+++ b/test/CodeGen/AMDGPU/saddo.ll
@@ -1,11 +1,14 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s
+
 
 declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
 declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
 
+
+declare { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
 ; FUNC-LABEL: {{^}}saddo_i64_zext:
 define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
@@ -65,3 +68,22 @@
   store i1 %carry, i1 addrspace(1)* %carryout
   ret void
 }
+
+; FUNC-LABEL: {{^}}v_saddo_v2i32:
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_add_{{[iu]}}32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_add_{{[iu]}}32
+define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
+  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+  %sadd = call { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
+  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
+  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
+  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  %carry.ext = zext <2 x i1> %carry to <2 x i32>
+  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/ssubo.ll b/test/CodeGen/AMDGPU/ssubo.ll
index fee14b4..4f4bab8 100644
--- a/test/CodeGen/AMDGPU/ssubo.ll
+++ b/test/CodeGen/AMDGPU/ssubo.ll
@@ -1,10 +1,11 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s
+
 
 declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
 declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
+declare { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
 
 ; FUNC-LABEL: {{^}}ssubo_i64_zext:
 define amdgpu_kernel void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
@@ -70,3 +71,22 @@
   store i1 %carry, i1 addrspace(1)* %carryout
   ret void
 }
+
+; FUNC-LABEL: {{^}}v_ssubo_v2i32:
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_sub_{{[iu]}}32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_sub_{{[iu]}}32
+define amdgpu_kernel void @v_ssubo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
+  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+  %sadd = call { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
+  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
+  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
+  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  %carry.ext = zext <2 x i1> %carry to <2 x i32>
+  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/uaddo.ll b/test/CodeGen/AMDGPU/uaddo.ll
index 0cb2487..cd9ea4d 100644
--- a/test/CodeGen/AMDGPU/uaddo.ll
+++ b/test/CodeGen/AMDGPU/uaddo.ll
@@ -1,7 +1,6 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
 
 ; FUNC-LABEL: {{^}}s_uaddo_i64_zext:
 ; GCN: s_add_u32
@@ -152,10 +151,32 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}v_uaddo_v2i32:
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_add_{{[iu]}}32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_add_{{[iu]}}32
+define amdgpu_kernel void @v_uaddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
+  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+  %sadd = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
+  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
+  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
+  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  %carry.ext = zext <2 x i1> %carry to <2 x i32>
+  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+  ret void
+}
+
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare { i16, i1 } @llvm.uadd.with.overflow.i16(i16, i16) #1
 declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
 declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #1
+declare { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/usubo.ll b/test/CodeGen/AMDGPU/usubo.ll
index eeb19f8..f4e1a9a 100644
--- a/test/CodeGen/AMDGPU/usubo.ll
+++ b/test/CodeGen/AMDGPU/usubo.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SICIVI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SICIVI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
+
 
 ; FUNC-LABEL: {{^}}s_usubo_i64_zext:
 ; GCN: s_sub_u32
@@ -159,10 +159,28 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}v_usubo_v2i32:
+; SICIVI: v_sub_{{[iu]}}32
+; SICIVI: v_cndmask_b32
+; SICIVI: v_sub_{{[iu]}}32
+; SICIVI: v_cndmask_b32
+define amdgpu_kernel void @v_usubo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
+  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+  %sadd = call { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
+  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
+  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
+  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  %carry.ext = zext <2 x i1> %carry to <2 x i32>
+  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare { i16, i1 } @llvm.usub.with.overflow.i16(i16, i16) #1
 declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1
 declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) #1
+declare { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/X86/vec_saddo.ll b/test/CodeGen/X86/vec_saddo.ll
new file mode 100644
index 0000000..dbdc412
--- /dev/null
+++ b/test/CodeGen/X86/vec_saddo.ll
@@ -0,0 +1,2028 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
+
+declare {<1 x i32>, <1 x i1>} @llvm.sadd.with.overflow.v1i32(<1 x i32>, <1 x i32>)
+declare {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>)
+declare {<3 x i32>, <3 x i1>} @llvm.sadd.with.overflow.v3i32(<3 x i32>, <3 x i32>)
+declare {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<6 x i32>, <6 x i1>} @llvm.sadd.with.overflow.v6i32(<6 x i32>, <6 x i32>)
+declare {<8 x i32>, <8 x i1>} @llvm.sadd.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<8 x i16>, <8 x i1>} @llvm.sadd.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+
+declare {<4 x i24>, <4 x i1>} @llvm.sadd.with.overflow.v4i24(<4 x i24>, <4 x i24>)
+declare {<4 x i1>, <4 x i1>} @llvm.sadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)
+declare {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128>, <2 x i128>)
+
+define <1 x i32> @saddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
+; SSE-LABEL: saddo_v1i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    addl %esi, %edi
+; SSE-NEXT:    seto %al
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    movl %edi, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: saddo_v1i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    addl %esi, %edi
+; AVX-NEXT:    seto %al
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    movl %edi, (%rdx)
+; AVX-NEXT:    retq
+  %t = call {<1 x i32>, <1 x i1>} @llvm.sadd.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
+  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
+  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
+  %res = sext <1 x i1> %obit to <1 x i32>
+  store <1 x i32> %val, <1 x i32>* %p2
+  ret <1 x i32> %res
+}
+
+define <2 x i32> @saddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
+; SSE2-LABEL: saddo_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $32, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    psllq $32, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    paddq %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psllq $32, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    movq %xmm1, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: saddo_v2i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    psllq $32, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT:    psllq $32, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    paddq %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    psllq $32, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT:    movq %xmm1, (%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: saddo_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psllq $32, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllq $32, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    paddq %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psllq $32, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE41-NEXT:    movq %xmm1, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: saddo_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT:    vmovq %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: saddo_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $32, %xmm1, %xmm2
+; AVX2-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT:    vpsllq $32, %xmm0, %xmm2
+; AVX2-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpsllq $32, %xmm1, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT:    vmovq %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: saddo_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX512-NEXT:    vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraq $32, %xmm0, %xmm0
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm1
+; AVX512-NEXT:    vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT:    vpmovqd %xmm0, (%rdi)
+; AVX512-NEXT:    vpcmpeqq %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %t = call {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
+  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i32> %val, <2 x i32>* %p2
+  ret <2 x i32> %res
+}
+
+define <3 x i32> @saddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
+; SSE2-LABEL: saddo_v3i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    movq %xmm0, (%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm0, 8(%rdi)
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: saddo_v3i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT:    pandn %xmm3, %xmm2
+; SSSE3-NEXT:    movq %xmm0, (%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm0, 8(%rdi)
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: saddo_v3i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT:    pxor %xmm4, %xmm3
+; SSE41-NEXT:    pxor %xmm5, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT:    pxor %xmm4, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE41-NEXT:    paddd %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pxor %xmm4, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE41-NEXT:    pandn %xmm3, %xmm2
+; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rdi)
+; SSE41-NEXT:    movq %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: saddo_v3i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
+; AVX1-NEXT:    vmovq %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: saddo_v3i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX2-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
+; AVX2-NEXT:    vmovq %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: saddo_v3i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k0
+; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kandnw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
+; AVX512-NEXT:    vmovq %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<3 x i32>, <3 x i1>} @llvm.sadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
+  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
+  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
+  %res = sext <3 x i1> %obit to <3 x i32>
+  store <3 x i32> %val, <3 x i32>* %p2
+  ret <3 x i32> %res
+}
+
+define <4 x i32> @saddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
+; SSE-LABEL: saddo_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    pxor %xmm3, %xmm3
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE-NEXT:    pxor %xmm4, %xmm3
+; SSE-NEXT:    pxor %xmm5, %xmm5
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE-NEXT:    pxor %xmm4, %xmm5
+; SSE-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE-NEXT:    pxor %xmm4, %xmm2
+; SSE-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE-NEXT:    pandn %xmm3, %xmm2
+; SSE-NEXT:    movdqa %xmm0, (%rdi)
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: saddo_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: saddo_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX2-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: saddo_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k0
+; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kandnw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
+  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i32> %val, <4 x i32>* %p2
+  ret <4 x i32> %res
+}
+
+define <6 x i32> @saddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
+; SSE2-LABEL: saddo_v6i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    movd %r8d, %xmm0
+; SSE2-NEXT:    movd %ecx, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movd %edx, %xmm0
+; SSE2-NEXT:    movd %esi, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movd %r9d, %xmm3
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm5, %xmm6
+; SSE2-NEXT:    pxor %xmm7, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
+; SSE2-NEXT:    pxor %xmm5, %xmm7
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSE2-NEXT:    paddd %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm5, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm4
+; SSE2-NEXT:    pandn %xmm6, %xmm4
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm6
+; SSE2-NEXT:    pxor %xmm5, %xmm6
+; SSE2-NEXT:    pxor %xmm7, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm7
+; SSE2-NEXT:    pxor %xmm5, %xmm7
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSE2-NEXT:    paddd %xmm3, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm2
+; SSE2-NEXT:    pandn %xmm6, %xmm2
+; SSE2-NEXT:    movq %xmm1, 16(%rcx)
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    movq %xmm2, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm4, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: saddo_v6i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movq %rdi, %rax
+; SSSE3-NEXT:    movd %r8d, %xmm0
+; SSSE3-NEXT:    movd %ecx, %xmm1
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    movd %edx, %xmm0
+; SSSE3-NEXT:    movd %esi, %xmm4
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    movd %r9d, %xmm3
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pxor %xmm6, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSSE3-NEXT:    pxor %xmm5, %xmm6
+; SSSE3-NEXT:    pxor %xmm7, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm7
+; SSSE3-NEXT:    pxor %xmm5, %xmm7
+; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSSE3-NEXT:    paddd %xmm4, %xmm0
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT:    pxor %xmm5, %xmm4
+; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm4
+; SSSE3-NEXT:    pandn %xmm6, %xmm4
+; SSSE3-NEXT:    pxor %xmm6, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm6
+; SSSE3-NEXT:    pxor %xmm5, %xmm6
+; SSSE3-NEXT:    pxor %xmm7, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm7
+; SSSE3-NEXT:    pxor %xmm5, %xmm7
+; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSSE3-NEXT:    paddd %xmm3, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm2
+; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm2
+; SSSE3-NEXT:    pandn %xmm6, %xmm2
+; SSSE3-NEXT:    movq %xmm1, 16(%rcx)
+; SSSE3-NEXT:    movdqa %xmm0, (%rcx)
+; SSSE3-NEXT:    movq %xmm2, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm4, (%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: saddo_v6i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movq %rdi, %rax
+; SSE41-NEXT:    movd %esi, %xmm4
+; SSE41-NEXT:    pinsrd $1, %edx, %xmm4
+; SSE41-NEXT:    pinsrd $2, %ecx, %xmm4
+; SSE41-NEXT:    pinsrd $3, %r8d, %xmm4
+; SSE41-NEXT:    movd %r9d, %xmm2
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
+; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pxor %xmm6, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    pxor %xmm7, %xmm7
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm7
+; SSE41-NEXT:    pxor %xmm5, %xmm7
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT:    paddd %xmm4, %xmm1
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE41-NEXT:    pxor %xmm5, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm4
+; SSE41-NEXT:    pandn %xmm6, %xmm4
+; SSE41-NEXT:    pxor %xmm6, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    pxor %xmm7, %xmm7
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm7
+; SSE41-NEXT:    pxor %xmm5, %xmm7
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT:    paddd %xmm2, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pxor %xmm5, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm3
+; SSE41-NEXT:    pandn %xmm6, %xmm3
+; SSE41-NEXT:    movq %xmm0, 16(%rcx)
+; SSE41-NEXT:    movdqa %xmm1, (%rcx)
+; SSE41-NEXT:    movq %xmm3, 16(%rdi)
+; SSE41-NEXT:    movdqa %xmm4, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: saddo_v6i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm3, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm7, %xmm8
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm9
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm4, %xmm9
+; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
+; AVX1-NEXT:    vpaddd %xmm2, %xmm6, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm0
+; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX1-NEXT:    vandps %ymm0, %ymm8, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vmovq %xmm2, 16(%rdi)
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: saddo_v6i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm3
+; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX2-NEXT:    vpxor %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm5
+; AVX2-NEXT:    vpxor %ymm4, %ymm5, %ymm5
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vpandn %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vmovq %xmm2, 16(%rdi)
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: saddo_v6i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k0
+; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm0, %k1
+; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kandnw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT:    vmovq %xmm2, 16(%rdi)
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<6 x i32>, <6 x i1>} @llvm.sadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
+  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
+  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
+  %res = sext <6 x i1> %obit to <6 x i32>
+  store <6 x i32> %val, <6 x i32>* %p2
+  ret <6 x i32> %res
+}
+
+define <8 x i32> @saddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
+; SSE-LABEL: saddo_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm6
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSE-NEXT:    pxor %xmm5, %xmm6
+; SSE-NEXT:    pxor %xmm7, %xmm7
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm7
+; SSE-NEXT:    pxor %xmm5, %xmm7
+; SSE-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSE-NEXT:    paddd %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE-NEXT:    pxor %xmm5, %xmm2
+; SSE-NEXT:    pcmpeqd %xmm7, %xmm2
+; SSE-NEXT:    pandn %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm6, %xmm6
+; SSE-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE-NEXT:    pxor %xmm5, %xmm6
+; SSE-NEXT:    pxor %xmm7, %xmm7
+; SSE-NEXT:    pcmpgtd %xmm4, %xmm7
+; SSE-NEXT:    pxor %xmm5, %xmm7
+; SSE-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSE-NEXT:    paddd %xmm3, %xmm4
+; SSE-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pcmpeqd %xmm7, %xmm1
+; SSE-NEXT:    pandn %xmm6, %xmm1
+; SSE-NEXT:    movdqa %xmm4, 16(%rdi)
+; SSE-NEXT:    movdqa %xmm0, (%rdi)
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: saddo_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm3, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm7, %xmm8
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm9
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm4, %xmm9
+; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
+; AVX1-NEXT:    vpaddd %xmm2, %xmm6, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm1
+; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT:    vandps %ymm1, %ymm8, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vmovaps %ymm2, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: saddo_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm3
+; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX2-NEXT:    vpxor %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm5
+; AVX2-NEXT:    vpxor %ymm4, %ymm5, %ymm5
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vpandn %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa %ymm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: saddo_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k0
+; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm0, %k1
+; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kandnw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<8 x i32>, <8 x i1>} @llvm.sadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
+  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
+  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
+  %res = sext <8 x i1> %obit to <8 x i32>
+  store <8 x i32> %val, <8 x i32>* %p2
+  ret <8 x i32> %res
+}
+
+define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind {
+; SSE-LABEL: saddo_v16i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm8
+; SSE-NEXT:    pxor %xmm3, %xmm3
+; SSE-NEXT:    pxor %xmm11, %xmm11
+; SSE-NEXT:    pcmpgtd %xmm4, %xmm11
+; SSE-NEXT:    pcmpeqd %xmm10, %xmm10
+; SSE-NEXT:    pxor %xmm10, %xmm11
+; SSE-NEXT:    pxor %xmm12, %xmm12
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm12
+; SSE-NEXT:    pxor %xmm10, %xmm12
+; SSE-NEXT:    pcmpeqd %xmm12, %xmm11
+; SSE-NEXT:    paddd %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm9, %xmm9
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm9
+; SSE-NEXT:    pxor %xmm10, %xmm9
+; SSE-NEXT:    pcmpeqd %xmm12, %xmm9
+; SSE-NEXT:    pandn %xmm11, %xmm9
+; SSE-NEXT:    pxor %xmm12, %xmm12
+; SSE-NEXT:    pcmpgtd %xmm5, %xmm12
+; SSE-NEXT:    pxor %xmm10, %xmm12
+; SSE-NEXT:    pxor %xmm4, %xmm4
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE-NEXT:    pxor %xmm10, %xmm4
+; SSE-NEXT:    pcmpeqd %xmm4, %xmm12
+; SSE-NEXT:    paddd %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm11, %xmm11
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm11
+; SSE-NEXT:    pxor %xmm10, %xmm11
+; SSE-NEXT:    pcmpeqd %xmm4, %xmm11
+; SSE-NEXT:    pandn %xmm12, %xmm11
+; SSE-NEXT:    pxor %xmm4, %xmm4
+; SSE-NEXT:    pcmpgtd %xmm6, %xmm4
+; SSE-NEXT:    pxor %xmm10, %xmm4
+; SSE-NEXT:    pxor %xmm5, %xmm5
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm10, %xmm5
+; SSE-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSE-NEXT:    paddd %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm6, %xmm6
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE-NEXT:    pxor %xmm10, %xmm6
+; SSE-NEXT:    pcmpeqd %xmm5, %xmm6
+; SSE-NEXT:    pandn %xmm4, %xmm6
+; SSE-NEXT:    pxor %xmm4, %xmm4
+; SSE-NEXT:    pcmpgtd %xmm7, %xmm4
+; SSE-NEXT:    pxor %xmm10, %xmm4
+; SSE-NEXT:    pxor %xmm5, %xmm5
+; SSE-NEXT:    pcmpgtd %xmm8, %xmm5
+; SSE-NEXT:    pxor %xmm10, %xmm5
+; SSE-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSE-NEXT:    paddd %xmm7, %xmm8
+; SSE-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSE-NEXT:    pxor %xmm10, %xmm3
+; SSE-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE-NEXT:    pandn %xmm4, %xmm3
+; SSE-NEXT:    movdqa %xmm8, 48(%rdi)
+; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
+; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSE-NEXT:    movdqa %xmm0, (%rdi)
+; SSE-NEXT:    movdqa %xmm9, %xmm0
+; SSE-NEXT:    movdqa %xmm11, %xmm1
+; SSE-NEXT:    movdqa %xmm6, %xmm2
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: saddo_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm9
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpgtd %xmm9, %xmm5, %xmm7
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpcmpgtd %xmm7, %xmm5, %xmm6
+; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm10
+; AVX1-NEXT:    vpcmpeqd %xmm8, %xmm10, %xmm8
+; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm5, %xmm6
+; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm11
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm5, %xmm6
+; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm11, %xmm6, %xmm11
+; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm11, %ymm8
+; AVX1-NEXT:    vpaddd %xmm9, %xmm7, %xmm9
+; AVX1-NEXT:    vpcmpgtd %xmm9, %xmm5, %xmm7
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7
+; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm10, %xmm7
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm10
+; AVX1-NEXT:    vpcmpgtd %xmm10, %xmm5, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm3, %ymm3
+; AVX1-NEXT:    vandps %ymm3, %ymm8, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT:    vpackssdw %xmm6, %xmm3, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm5, %xmm7
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm5, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm3, %xmm11
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm5, %xmm7
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm12
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm5, %xmm7
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7
+; AVX1-NEXT:    vpcmpeqd %xmm12, %xmm7, %xmm12
+; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm12, %ymm11
+; AVX1-NEXT:    vpaddd %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm5, %xmm6
+; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm5, %xmm2
+; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm7, %xmm2
+; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vandps %ymm2, %ymm11, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm10, %ymm4
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpmovsxwd %xmm8, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm8[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vmovaps %ymm4, 32(%rdi)
+; AVX1-NEXT:    vmovaps %ymm3, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: saddo_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm4, %ymm5
+; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm6, %ymm6
+; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm4, %ymm7
+; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm7
+; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm7, %ymm5
+; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm3
+; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm4, %ymm1
+; AVX2-NEXT:    vpxor %ymm6, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm7, %ymm1
+; AVX2-NEXT:    vpandn %ymm5, %ymm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm4, %ymm5
+; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm4, %ymm7
+; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm7
+; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm7, %ymm5
+; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm4, %ymm0
+; AVX2-NEXT:    vpxor %ymm6, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm7, %ymm0
+; AVX2-NEXT:    vpandn %ymm5, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: saddo_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm1, %k0
+; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm0, %k1
+; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm1, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kandnw %k1, %k0, %k1
+; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
+  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
+  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
+  %res = sext <16 x i1> %obit to <16 x i32>
+  store <16 x i32> %val, <16 x i32>* %p2
+  ret <16 x i32> %res
+}
+
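+; Note: for the narrower element types below, the expansion follows the same
+; sign-comparison pattern seen above: overflow is flagged when both operands
+; share a sign and the sum's sign differs, and the resulting i1 mask is then
+; sign-extended to i32 lanes.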
+define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind {
+; SSE2-LABEL: saddo_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    pcmpeqb %xmm5, %xmm2
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pcmpeqb %xmm5, %xmm3
+; SSE2-NEXT:    pandn %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT:    pslld $31, %xmm4
+; SSE2-NEXT:    psrad $31, %xmm4
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    pslld $31, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    pslld $31, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    pslld $31, %xmm3
+; SSE2-NEXT:    psrad $31, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: saddo_v16i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pcmpgtb %xmm0, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    pcmpeqb %xmm5, %xmm2
+; SSSE3-NEXT:    paddb %xmm1, %xmm0
+; SSSE3-NEXT:    pcmpgtb %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSSE3-NEXT:    pcmpeqb %xmm5, %xmm3
+; SSSE3-NEXT:    pandn %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm3, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSSE3-NEXT:    pslld $31, %xmm4
+; SSSE3-NEXT:    psrad $31, %xmm4
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    pslld $31, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    pslld $31, %xmm2
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSSE3-NEXT:    pslld $31, %xmm3
+; SSSE3-NEXT:    psrad $31, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: saddo_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT:    pxor %xmm4, %xmm2
+; SSE41-NEXT:    pxor %xmm5, %xmm5
+; SSE41-NEXT:    pcmpgtb %xmm0, %xmm5
+; SSE41-NEXT:    pxor %xmm4, %xmm5
+; SSE41-NEXT:    pcmpeqb %xmm5, %xmm2
+; SSE41-NEXT:    paddb %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtb %xmm0, %xmm3
+; SSE41-NEXT:    pxor %xmm4, %xmm3
+; SSE41-NEXT:    pcmpeqb %xmm5, %xmm3
+; SSE41-NEXT:    pandn %xmm2, %xmm3
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm4
+; SSE41-NEXT:    psrad $31, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm3
+; SSE41-NEXT:    psrad $31, %xmm3
+; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: saddo_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm6
+; AVX1-NEXT:    vpcmpgtb %xmm6, %xmm2, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpandn %xmm3, %xmm0, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vmovdqa %xmm6, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: saddo_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm5
+; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT:    vpcmpeqb %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm6
+; AVX2-NEXT:    vpcmpgtb %xmm6, %xmm2, %xmm0
+; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqb %xmm0, %xmm5, %xmm0
+; AVX2-NEXT:    vpandn %xmm3, %xmm0, %xmm1
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrad $31, %ymm1, %ymm1
+; AVX2-NEXT:    vmovdqa %xmm6, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: saddo_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltb %xmm2, %xmm1, %k0
+; AVX512-NEXT:    vpcmpnltb %xmm2, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpnltb %xmm2, %xmm1, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kandnw %k1, %k0, %k1
+; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
+  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
+  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
+  %res = sext <16 x i1> %obit to <16 x i32>
+  store <16 x i8> %val, <16 x i8>* %p2
+  ret <16 x i32> %res
+}
+
+define <8 x i32> @saddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind {
+; SSE2-LABEL: saddo_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtw %xmm2, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtw %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    pcmpeqw %xmm5, %xmm3
+; SSE2-NEXT:    paddw %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pcmpeqw %xmm5, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    pslld $31, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    pslld $31, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: saddo_v8i16:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpgtw %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pcmpgtw %xmm0, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    pcmpeqw %xmm5, %xmm3
+; SSSE3-NEXT:    paddw %xmm2, %xmm0
+; SSSE3-NEXT:    pcmpgtw %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    pcmpeqw %xmm5, %xmm1
+; SSSE3-NEXT:    pandn %xmm3, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    pslld $31, %xmm2
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    pslld $31, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: saddo_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pcmpgtw %xmm2, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT:    pxor %xmm4, %xmm3
+; SSE41-NEXT:    pxor %xmm5, %xmm5
+; SSE41-NEXT:    pcmpgtw %xmm0, %xmm5
+; SSE41-NEXT:    pxor %xmm4, %xmm5
+; SSE41-NEXT:    pcmpeqw %xmm5, %xmm3
+; SSE41-NEXT:    paddw %xmm2, %xmm0
+; SSE41-NEXT:    pcmpgtw %xmm0, %xmm1
+; SSE41-NEXT:    pxor %xmm4, %xmm1
+; SSE41-NEXT:    pcmpeqw %xmm5, %xmm1
+; SSE41-NEXT:    pandn %xmm3, %xmm1
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT:    pslld $31, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT:    pslld $31, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: saddo_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqw %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: saddo_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm5
+; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT:    vpcmpeqw %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm0
+; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqw %xmm0, %xmm5, %xmm0
+; AVX2-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: saddo_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltw %xmm2, %xmm1, %k0
+; AVX512-NEXT:    vpcmpnltw %xmm2, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpnltw %xmm2, %xmm1, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kandnw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<8 x i16>, <8 x i1>} @llvm.sadd.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
+  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
+  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
+  %res = sext <8 x i1> %obit to <8 x i32>
+  store <8 x i16> %val, <8 x i16>* %p2
+  ret <8 x i32> %res
+}
+
+define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
+; SSE2-LABEL: saddo_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    paddq %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm5
+; SSE2-NEXT:    pxor %xmm1, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm3, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: saddo_v2i64:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    paddq %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm5, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm4
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    pxor %xmm1, %xmm4
+; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm2, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT:    por %xmm3, %xmm5
+; SSSE3-NEXT:    pxor %xmm1, %xmm5
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
+; SSSE3-NEXT:    pand %xmm4, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm2
+; SSSE3-NEXT:    pxor %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pandn %xmm3, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: saddo_v2i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    paddq %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE41-NEXT:    por %xmm1, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm1, %xmm4
+; SSE41-NEXT:    pxor %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm2, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE41-NEXT:    por %xmm3, %xmm5
+; SSE41-NEXT:    pxor %xmm1, %xmm5
+; SSE41-NEXT:    pcmpeqq %xmm5, %xmm4
+; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqq %xmm5, %xmm0
+; SSE41-NEXT:    pandn %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: saddo_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: saddo_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
+; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT:    vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm0
+; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqq %xmm0, %xmm5, %xmm0
+; AVX2-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: saddo_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm1, %k0
+; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm0, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kandnw %k1, %k0, %k1
+; AVX512-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    retq
+  %t = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
+  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i64> %val, <2 x i64>* %p2
+  ret <2 x i32> %res
+}
+
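+; Note: the <4 x i24> case is handled by promoting each lane to i32 with a
+; shl/ashr pair, so overflow is detected by comparing the sum against its
+; copy sign-extended from bit 23.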
+define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
+; SSE2-LABEL: saddo_v4i24:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pslld $8, %xmm1
+; SSE2-NEXT:    psrad $8, %xmm1
+; SSE2-NEXT:    pslld $8, %xmm2
+; SSE2-NEXT:    psrad $8, %xmm2
+; SSE2-NEXT:    paddd %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pslld $8, %xmm0
+; SSE2-NEXT:    psrad $8, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    movw %ax, (%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; SSE2-NEXT:    movd %xmm1, %ecx
+; SSE2-NEXT:    movw %cx, 9(%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movd %xmm1, %edx
+; SSE2-NEXT:    movw %dx, 6(%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSE2-NEXT:    movd %xmm1, %esi
+; SSE2-NEXT:    movw %si, 3(%rdi)
+; SSE2-NEXT:    shrl $16, %eax
+; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    shrl $16, %ecx
+; SSE2-NEXT:    movb %cl, 11(%rdi)
+; SSE2-NEXT:    shrl $16, %edx
+; SSE2-NEXT:    movb %dl, 8(%rdi)
+; SSE2-NEXT:    shrl $16, %esi
+; SSE2-NEXT:    movb %sil, 5(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: saddo_v4i24:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pslld $8, %xmm1
+; SSSE3-NEXT:    psrad $8, %xmm1
+; SSSE3-NEXT:    pslld $8, %xmm2
+; SSSE3-NEXT:    psrad $8, %xmm2
+; SSSE3-NEXT:    paddd %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    pslld $8, %xmm0
+; SSSE3-NEXT:    psrad $8, %xmm0
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    pxor %xmm1, %xmm0
+; SSSE3-NEXT:    movd %xmm2, %eax
+; SSSE3-NEXT:    movw %ax, (%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm1, %ecx
+; SSSE3-NEXT:    movw %cx, 9(%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm1, %edx
+; SSSE3-NEXT:    movw %dx, 6(%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm1, %esi
+; SSSE3-NEXT:    movw %si, 3(%rdi)
+; SSSE3-NEXT:    shrl $16, %eax
+; SSSE3-NEXT:    movb %al, 2(%rdi)
+; SSSE3-NEXT:    shrl $16, %ecx
+; SSSE3-NEXT:    movb %cl, 11(%rdi)
+; SSSE3-NEXT:    shrl $16, %edx
+; SSSE3-NEXT:    movb %dl, 8(%rdi)
+; SSSE3-NEXT:    shrl $16, %esi
+; SSSE3-NEXT:    movb %sil, 5(%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: saddo_v4i24:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pslld $8, %xmm1
+; SSE41-NEXT:    psrad $8, %xmm1
+; SSE41-NEXT:    pslld $8, %xmm2
+; SSE41-NEXT:    psrad $8, %xmm2
+; SSE41-NEXT:    paddd %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pslld $8, %xmm0
+; SSE41-NEXT:    psrad $8, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrd $3, %xmm2, %eax
+; SSE41-NEXT:    movw %ax, 9(%rdi)
+; SSE41-NEXT:    pextrd $2, %xmm2, %ecx
+; SSE41-NEXT:    movw %cx, 6(%rdi)
+; SSE41-NEXT:    pextrd $1, %xmm2, %edx
+; SSE41-NEXT:    movw %dx, 3(%rdi)
+; SSE41-NEXT:    movd %xmm2, %esi
+; SSE41-NEXT:    movw %si, (%rdi)
+; SSE41-NEXT:    shrl $16, %eax
+; SSE41-NEXT:    movb %al, 11(%rdi)
+; SSE41-NEXT:    shrl $16, %ecx
+; SSE41-NEXT:    movb %cl, 8(%rdi)
+; SSE41-NEXT:    shrl $16, %edx
+; SSE41-NEXT:    movb %dl, 5(%rdi)
+; SSE41-NEXT:    shrl $16, %esi
+; SSE41-NEXT:    movb %sil, 2(%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: saddo_v4i24:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpslld $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpslld $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpslld $8, %xmm1, %xmm0
+; AVX1-NEXT:    vpsrad $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrd $3, %xmm1, %eax
+; AVX1-NEXT:    movw %ax, 9(%rdi)
+; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX1-NEXT:    movw %cx, 6(%rdi)
+; AVX1-NEXT:    vpextrd $1, %xmm1, %edx
+; AVX1-NEXT:    movw %dx, 3(%rdi)
+; AVX1-NEXT:    vmovd %xmm1, %esi
+; AVX1-NEXT:    movw %si, (%rdi)
+; AVX1-NEXT:    shrl $16, %eax
+; AVX1-NEXT:    movb %al, 11(%rdi)
+; AVX1-NEXT:    shrl $16, %ecx
+; AVX1-NEXT:    movb %cl, 8(%rdi)
+; AVX1-NEXT:    shrl $16, %edx
+; AVX1-NEXT:    movb %dl, 5(%rdi)
+; AVX1-NEXT:    shrl $16, %esi
+; AVX1-NEXT:    movb %sil, 2(%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: saddo_v4i24:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpslld $8, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrad $8, %xmm1, %xmm1
+; AVX2-NEXT:    vpslld $8, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $8, %xmm0, %xmm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpslld $8, %xmm1, %xmm0
+; AVX2-NEXT:    vpsrad $8, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrd $3, %xmm1, %eax
+; AVX2-NEXT:    movw %ax, 9(%rdi)
+; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX2-NEXT:    movw %cx, 6(%rdi)
+; AVX2-NEXT:    vpextrd $1, %xmm1, %edx
+; AVX2-NEXT:    movw %dx, 3(%rdi)
+; AVX2-NEXT:    vmovd %xmm1, %esi
+; AVX2-NEXT:    movw %si, (%rdi)
+; AVX2-NEXT:    shrl $16, %eax
+; AVX2-NEXT:    movb %al, 11(%rdi)
+; AVX2-NEXT:    shrl $16, %ecx
+; AVX2-NEXT:    movb %cl, 8(%rdi)
+; AVX2-NEXT:    shrl $16, %edx
+; AVX2-NEXT:    movb %dl, 5(%rdi)
+; AVX2-NEXT:    shrl $16, %esi
+; AVX2-NEXT:    movb %sil, 2(%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: saddo_v4i24:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $8, %xmm1, %xmm1
+; AVX512-NEXT:    vpsrad $8, %xmm1, %xmm1
+; AVX512-NEXT:    vpslld $8, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpslld $8, %xmm1, %xmm0
+; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
+; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
+; AVX512-NEXT:    movw %ax, 9(%rdi)
+; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX512-NEXT:    movw %cx, 6(%rdi)
+; AVX512-NEXT:    vpextrd $1, %xmm1, %edx
+; AVX512-NEXT:    movw %dx, 3(%rdi)
+; AVX512-NEXT:    vmovd %xmm1, %esi
+; AVX512-NEXT:    movw %si, (%rdi)
+; AVX512-NEXT:    shrl $16, %eax
+; AVX512-NEXT:    movb %al, 11(%rdi)
+; AVX512-NEXT:    shrl $16, %ecx
+; AVX512-NEXT:    movb %cl, 8(%rdi)
+; AVX512-NEXT:    shrl $16, %edx
+; AVX512-NEXT:    movb %dl, 5(%rdi)
+; AVX512-NEXT:    shrl $16, %esi
+; AVX512-NEXT:    movb %sil, 2(%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<4 x i24>, <4 x i1>} @llvm.sadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
+  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i24> %val, <4 x i24>* %p2
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
+; SSE-LABEL: saddo_v4i1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pslld $31, %xmm1
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    pslld $31, %xmm0
+; SSE-NEXT:    psrad $31, %xmm0
+; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    pslld $31, %xmm1
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    movmskps %xmm1, %eax
+; SSE-NEXT:    movb %al, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: saddo_v4i1:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovmskps %xmm1, %eax
+; AVX1-NEXT:    movb %al, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: saddo_v4i1:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpslld $31, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vmovmskps %xmm1, %eax
+; AVX2-NEXT:    movb %al, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: saddo_v4i1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
+; AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vpslld $31, %xmm1, %xmm0
+; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k3
+; AVX512-NEXT:    kxorw %k2, %k0, %k0
+; AVX512-NEXT:    kxorw %k0, %k1, %k1
+; AVX512-NEXT:    kandnw %k3, %k1, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    movb %al, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<4 x i1>, <4 x i1>} @llvm.sadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
+  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i1> %val, <4 x i1>* %p2
+  ret <4 x i32> %res
+}
+
+define <2 x i32> @saddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
+; SSE2-LABEL: saddo_v2i128:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSE2-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT:    movq %rcx, %rax
+; SSE2-NEXT:    adcq %r11, %rax
+; SSE2-NEXT:    setns %bl
+; SSE2-NEXT:    testq %rcx, %rcx
+; SSE2-NEXT:    setns %cl
+; SSE2-NEXT:    cmpb %bl, %cl
+; SSE2-NEXT:    setne %bpl
+; SSE2-NEXT:    testq %r11, %r11
+; SSE2-NEXT:    setns %bl
+; SSE2-NEXT:    cmpb %bl, %cl
+; SSE2-NEXT:    sete %cl
+; SSE2-NEXT:    andb %bpl, %cl
+; SSE2-NEXT:    movzbl %cl, %ebp
+; SSE2-NEXT:    testq %r9, %r9
+; SSE2-NEXT:    setns %bl
+; SSE2-NEXT:    testq %rsi, %rsi
+; SSE2-NEXT:    setns %cl
+; SSE2-NEXT:    cmpb %bl, %cl
+; SSE2-NEXT:    sete %r11b
+; SSE2-NEXT:    addq %r8, %rdi
+; SSE2-NEXT:    adcq %r9, %rsi
+; SSE2-NEXT:    setns %bl
+; SSE2-NEXT:    cmpb %bl, %cl
+; SSE2-NEXT:    setne %cl
+; SSE2-NEXT:    andb %r11b, %cl
+; SSE2-NEXT:    movzbl %cl, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    pinsrw $4, %ebp, %xmm0
+; SSE2-NEXT:    movq %rdx, 16(%r10)
+; SSE2-NEXT:    movq %rdi, (%r10)
+; SSE2-NEXT:    movq %rax, 24(%r10)
+; SSE2-NEXT:    movq %rsi, 8(%r10)
+; SSE2-NEXT:    psllq $63, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: saddo_v2i128:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pushq %rbp
+; SSSE3-NEXT:    pushq %rbx
+; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSSE3-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
+; SSSE3-NEXT:    movq %rcx, %rax
+; SSSE3-NEXT:    adcq %r11, %rax
+; SSSE3-NEXT:    setns %bl
+; SSSE3-NEXT:    testq %rcx, %rcx
+; SSSE3-NEXT:    setns %cl
+; SSSE3-NEXT:    cmpb %bl, %cl
+; SSSE3-NEXT:    setne %bpl
+; SSSE3-NEXT:    testq %r11, %r11
+; SSSE3-NEXT:    setns %bl
+; SSSE3-NEXT:    cmpb %bl, %cl
+; SSSE3-NEXT:    sete %cl
+; SSSE3-NEXT:    andb %bpl, %cl
+; SSSE3-NEXT:    movzbl %cl, %ebp
+; SSSE3-NEXT:    testq %r9, %r9
+; SSSE3-NEXT:    setns %bl
+; SSSE3-NEXT:    testq %rsi, %rsi
+; SSSE3-NEXT:    setns %cl
+; SSSE3-NEXT:    cmpb %bl, %cl
+; SSSE3-NEXT:    sete %r11b
+; SSSE3-NEXT:    addq %r8, %rdi
+; SSSE3-NEXT:    adcq %r9, %rsi
+; SSSE3-NEXT:    setns %bl
+; SSSE3-NEXT:    cmpb %bl, %cl
+; SSSE3-NEXT:    setne %cl
+; SSSE3-NEXT:    andb %r11b, %cl
+; SSSE3-NEXT:    movzbl %cl, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    pinsrw $4, %ebp, %xmm0
+; SSSE3-NEXT:    movq %rdx, 16(%r10)
+; SSSE3-NEXT:    movq %rdi, (%r10)
+; SSSE3-NEXT:    movq %rax, 24(%r10)
+; SSSE3-NEXT:    movq %rsi, 8(%r10)
+; SSSE3-NEXT:    psllq $63, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    popq %rbx
+; SSSE3-NEXT:    popq %rbp
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: saddo_v2i128:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pushq %rbp
+; SSE41-NEXT:    pushq %rbx
+; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSE41-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
+; SSE41-NEXT:    movq %rcx, %rax
+; SSE41-NEXT:    adcq %r11, %rax
+; SSE41-NEXT:    setns %bl
+; SSE41-NEXT:    testq %rcx, %rcx
+; SSE41-NEXT:    setns %cl
+; SSE41-NEXT:    cmpb %bl, %cl
+; SSE41-NEXT:    setne %bpl
+; SSE41-NEXT:    testq %r11, %r11
+; SSE41-NEXT:    setns %bl
+; SSE41-NEXT:    cmpb %bl, %cl
+; SSE41-NEXT:    sete %cl
+; SSE41-NEXT:    andb %bpl, %cl
+; SSE41-NEXT:    movzbl %cl, %ebp
+; SSE41-NEXT:    testq %r9, %r9
+; SSE41-NEXT:    setns %bl
+; SSE41-NEXT:    testq %rsi, %rsi
+; SSE41-NEXT:    setns %cl
+; SSE41-NEXT:    cmpb %bl, %cl
+; SSE41-NEXT:    sete %r11b
+; SSE41-NEXT:    addq %r8, %rdi
+; SSE41-NEXT:    adcq %r9, %rsi
+; SSE41-NEXT:    setns %bl
+; SSE41-NEXT:    cmpb %bl, %cl
+; SSE41-NEXT:    setne %cl
+; SSE41-NEXT:    andb %r11b, %cl
+; SSE41-NEXT:    movzbl %cl, %ecx
+; SSE41-NEXT:    movd %ecx, %xmm0
+; SSE41-NEXT:    pinsrb $8, %ebp, %xmm0
+; SSE41-NEXT:    movq %rdx, 16(%r10)
+; SSE41-NEXT:    movq %rdi, (%r10)
+; SSE41-NEXT:    movq %rax, 24(%r10)
+; SSE41-NEXT:    movq %rsi, 8(%r10)
+; SSE41-NEXT:    psllq $63, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    popq %rbx
+; SSE41-NEXT:    popq %rbp
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: saddo_v2i128:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX1-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
+; AVX1-NEXT:    movq %rcx, %rax
+; AVX1-NEXT:    adcq %r11, %rax
+; AVX1-NEXT:    setns %bl
+; AVX1-NEXT:    testq %rcx, %rcx
+; AVX1-NEXT:    setns %cl
+; AVX1-NEXT:    cmpb %bl, %cl
+; AVX1-NEXT:    setne %bpl
+; AVX1-NEXT:    testq %r11, %r11
+; AVX1-NEXT:    setns %bl
+; AVX1-NEXT:    cmpb %bl, %cl
+; AVX1-NEXT:    sete %cl
+; AVX1-NEXT:    andb %bpl, %cl
+; AVX1-NEXT:    movzbl %cl, %ebp
+; AVX1-NEXT:    testq %r9, %r9
+; AVX1-NEXT:    setns %bl
+; AVX1-NEXT:    testq %rsi, %rsi
+; AVX1-NEXT:    setns %cl
+; AVX1-NEXT:    cmpb %bl, %cl
+; AVX1-NEXT:    sete %r11b
+; AVX1-NEXT:    addq %r8, %rdi
+; AVX1-NEXT:    adcq %r9, %rsi
+; AVX1-NEXT:    setns %bl
+; AVX1-NEXT:    cmpb %bl, %cl
+; AVX1-NEXT:    setne %cl
+; AVX1-NEXT:    andb %r11b, %cl
+; AVX1-NEXT:    movzbl %cl, %ecx
+; AVX1-NEXT:    vmovd %ecx, %xmm0
+; AVX1-NEXT:    vpinsrb $8, %ebp, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rdx, 16(%r10)
+; AVX1-NEXT:    movq %rdi, (%r10)
+; AVX1-NEXT:    movq %rax, 24(%r10)
+; AVX1-NEXT:    movq %rsi, 8(%r10)
+; AVX1-NEXT:    vpsllq $63, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: saddo_v2i128:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT:    movq %rcx, %rax
+; AVX2-NEXT:    adcq %r11, %rax
+; AVX2-NEXT:    setns %bl
+; AVX2-NEXT:    testq %rcx, %rcx
+; AVX2-NEXT:    setns %cl
+; AVX2-NEXT:    cmpb %bl, %cl
+; AVX2-NEXT:    setne %bpl
+; AVX2-NEXT:    testq %r11, %r11
+; AVX2-NEXT:    setns %bl
+; AVX2-NEXT:    cmpb %bl, %cl
+; AVX2-NEXT:    sete %cl
+; AVX2-NEXT:    andb %bpl, %cl
+; AVX2-NEXT:    movzbl %cl, %ebp
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    setns %bl
+; AVX2-NEXT:    testq %rsi, %rsi
+; AVX2-NEXT:    setns %cl
+; AVX2-NEXT:    cmpb %bl, %cl
+; AVX2-NEXT:    sete %r11b
+; AVX2-NEXT:    addq %r8, %rdi
+; AVX2-NEXT:    adcq %r9, %rsi
+; AVX2-NEXT:    setns %bl
+; AVX2-NEXT:    cmpb %bl, %cl
+; AVX2-NEXT:    setne %cl
+; AVX2-NEXT:    andb %r11b, %cl
+; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    vmovd %ecx, %xmm0
+; AVX2-NEXT:    vpinsrb $8, %ebp, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rdx, 16(%r10)
+; AVX2-NEXT:    movq %rdi, (%r10)
+; AVX2-NEXT:    movq %rax, 24(%r10)
+; AVX2-NEXT:    movq %rsi, 8(%r10)
+; AVX2-NEXT:    vpsllq $63, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: saddo_v2i128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT:    movq %rcx, %r14
+; AVX512-NEXT:    adcq %r11, %r14
+; AVX512-NEXT:    setns %bl
+; AVX512-NEXT:    testq %rcx, %rcx
+; AVX512-NEXT:    setns %cl
+; AVX512-NEXT:    cmpb %bl, %cl
+; AVX512-NEXT:    setne %bl
+; AVX512-NEXT:    testq %r11, %r11
+; AVX512-NEXT:    setns %al
+; AVX512-NEXT:    cmpb %al, %cl
+; AVX512-NEXT:    sete %al
+; AVX512-NEXT:    andb %bl, %al
+; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    testq %r9, %r9
+; AVX512-NEXT:    setns %al
+; AVX512-NEXT:    testq %rsi, %rsi
+; AVX512-NEXT:    setns %cl
+; AVX512-NEXT:    cmpb %al, %cl
+; AVX512-NEXT:    sete %al
+; AVX512-NEXT:    addq %r8, %rdi
+; AVX512-NEXT:    adcq %r9, %rsi
+; AVX512-NEXT:    setns %bl
+; AVX512-NEXT:    cmpb %bl, %cl
+; AVX512-NEXT:    setne %cl
+; AVX512-NEXT:    andb %al, %cl
+; AVX512-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1
+; AVX512-NEXT:    movq %rdx, 16(%r10)
+; AVX512-NEXT:    movq %rdi, (%r10)
+; AVX512-NEXT:    movq %r14, 24(%r10)
+; AVX512-NEXT:    movq %rsi, 8(%r10)
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    retq
+  %t = call {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
+  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i128> %val, <2 x i128>* %p2
+  ret <2 x i32> %res
+}
diff --git a/test/CodeGen/X86/vec_ssubo.ll b/test/CodeGen/X86/vec_ssubo.ll
new file mode 100644
index 0000000..0e1354b
--- /dev/null
+++ b/test/CodeGen/X86/vec_ssubo.ll
@@ -0,0 +1,2078 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
+
+declare {<1 x i32>, <1 x i1>} @llvm.ssub.with.overflow.v1i32(<1 x i32>, <1 x i32>)
+declare {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>)
+declare {<3 x i32>, <3 x i1>} @llvm.ssub.with.overflow.v3i32(<3 x i32>, <3 x i32>)
+declare {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<6 x i32>, <6 x i1>} @llvm.ssub.with.overflow.v6i32(<6 x i32>, <6 x i32>)
+declare {<8 x i32>, <8 x i1>} @llvm.ssub.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<8 x i16>, <8 x i1>} @llvm.ssub.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+
+declare {<4 x i24>, <4 x i1>} @llvm.ssub.with.overflow.v4i24(<4 x i24>, <4 x i24>)
+declare {<4 x i1>, <4 x i1>} @llvm.ssub.with.overflow.v4i1(<4 x i1>, <4 x i1>)
+declare {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128>, <2 x i128>)
+
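+; <1 x i32> is scalarized: the whole operation lowers to a plain scalar sub + seto,
+; with the i1 overflow bit sign-extended via negl.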
+define <1 x i32> @ssubo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
+; SSE-LABEL: ssubo_v1i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    subl %esi, %edi
+; SSE-NEXT:    seto %al
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    movl %edi, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: ssubo_v1i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    subl %esi, %edi
+; AVX-NEXT:    seto %al
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    movl %edi, (%rdx)
+; AVX-NEXT:    retq
+  %t = call {<1 x i32>, <1 x i1>} @llvm.ssub.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
+  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
+  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
+  %res = sext <1 x i1> %obit to <1 x i32>
+  store <1 x i32> %val, <1 x i32>* %p2
+  ret <1 x i32> %res
+}
+
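+; <2 x i32> is promoted to <2 x i64> under the legalization rules in effect here,
+; hence the sign-extension sequences (psllq $32 + psrad $31, or vpsraq on AVX512)
+; around the 64-bit subtract.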
+define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
+; SSE2-LABEL: ssubo_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $32, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    psllq $32, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    psubq %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psllq $32, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    movq %xmm1, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: ssubo_v2i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    psllq $32, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT:    psllq $32, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    psubq %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    psllq $32, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT:    movq %xmm1, (%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: ssubo_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psllq $32, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllq $32, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    psubq %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psllq $32, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE41-NEXT:    movq %xmm1, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: ssubo_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT:    vmovq %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: ssubo_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $32, %xmm1, %xmm2
+; AVX2-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT:    vpsllq $32, %xmm0, %xmm2
+; AVX2-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpsllq $32, %xmm1, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT:    vmovq %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: ssubo_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX512-NEXT:    vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraq $32, %xmm0, %xmm0
+; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm1
+; AVX512-NEXT:    vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT:    vpmovqd %xmm0, (%rdi)
+; AVX512-NEXT:    vpcmpeqq %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %t = call {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
+  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i32> %val, <2 x i32>* %p2
+  ret <2 x i32> %res
+}
+
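+; <3 x i32> is widened to <4 x i32>; only the three live lanes of the result are
+; written back (an 8-byte movq plus a separate 4-byte store).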
+define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
+; SSE2-LABEL: ssubo_v3i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT:    psubd %xmm1, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    movq %xmm0, (%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm0, 8(%rdi)
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: ssubo_v3i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT:    psubd %xmm1, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSSE3-NEXT:    pandn %xmm3, %xmm2
+; SSSE3-NEXT:    movq %xmm0, (%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm0, 8(%rdi)
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: ssubo_v3i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT:    pxor %xmm4, %xmm2
+; SSE41-NEXT:    pxor %xmm5, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT:    pxor %xmm4, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE41-NEXT:    psubd %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pxor %xmm4, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE41-NEXT:    pxor %xmm4, %xmm3
+; SSE41-NEXT:    pandn %xmm3, %xmm2
+; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rdi)
+; SSE41-NEXT:    movq %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: ssubo_v3i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
+; AVX1-NEXT:    vmovq %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: ssubo_v3i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
+; AVX2-NEXT:    vmovq %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: ssubo_v3i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k0
+; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kandw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
+; AVX512-NEXT:    vmovq %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<3 x i32>, <3 x i1>} @llvm.ssub.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
+  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
+  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
+  %res = sext <3 x i1> %obit to <3 x i32>
+  store <3 x i32> %val, <3 x i32>* %p2
+  ret <3 x i32> %res
+}
+
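+; <4 x i32> is already legal, so this shows the baseline lowering: overflow is set
+; when the operands' signs differ and the result's sign differs from the first
+; operand's, computed with sign tests against zero (pcmpgtd / vpcmpnltd).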
+define <4 x i32> @ssubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
+; SSE-LABEL: ssubo_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm3, %xmm3
+; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE-NEXT:    pxor %xmm4, %xmm2
+; SSE-NEXT:    pxor %xmm5, %xmm5
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE-NEXT:    pxor %xmm4, %xmm5
+; SSE-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE-NEXT:    psubd %xmm1, %xmm0
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE-NEXT:    pxor %xmm4, %xmm3
+; SSE-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE-NEXT:    pxor %xmm4, %xmm3
+; SSE-NEXT:    pandn %xmm3, %xmm2
+; SSE-NEXT:    movdqa %xmm0, (%rdi)
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: ssubo_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: ssubo_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: ssubo_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k0
+; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kandw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
+  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i32> %val, <4 x i32>* %p2
+  ret <4 x i32> %res
+}
+
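+; <6 x i32> is widened to <8 x i32>; only the six live lanes (24 bytes) of the
+; difference are stored, and the SSE variants take the vector in GPRs/stack slots
+; and return the mask indirectly through a hidden pointer.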
+define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
+; SSE2-LABEL: ssubo_v6i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    movd %r8d, %xmm0
+; SSE2-NEXT:    movd %ecx, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movd %edx, %xmm2
+; SSE2-NEXT:    movd %esi, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0]
+; SSE2-NEXT:    movd %r9d, %xmm1
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm5, %xmm2
+; SSE2-NEXT:    pxor %xmm7, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm7
+; SSE2-NEXT:    pxor %xmm5, %xmm7
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm2
+; SSE2-NEXT:    psubd %xmm6, %xmm0
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT:    pxor %xmm5, %xmm6
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSE2-NEXT:    pxor %xmm5, %xmm6
+; SSE2-NEXT:    pandn %xmm6, %xmm2
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    pxor %xmm5, %xmm6
+; SSE2-NEXT:    pxor %xmm7, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm7
+; SSE2-NEXT:    pxor %xmm5, %xmm7
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSE2-NEXT:    psubd %xmm4, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm3
+; SSE2-NEXT:    pandn %xmm3, %xmm6
+; SSE2-NEXT:    movq %xmm1, 16(%rcx)
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    movq %xmm6, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm2, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: ssubo_v6i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movq %rdi, %rax
+; SSSE3-NEXT:    movd %r8d, %xmm0
+; SSSE3-NEXT:    movd %ecx, %xmm1
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    movd %edx, %xmm2
+; SSSE3-NEXT:    movd %esi, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0]
+; SSSE3-NEXT:    movd %r9d, %xmm1
+; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSSE3-NEXT:    pxor %xmm5, %xmm2
+; SSSE3-NEXT:    pxor %xmm7, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm7
+; SSSE3-NEXT:    pxor %xmm5, %xmm7
+; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm2
+; SSSE3-NEXT:    psubd %xmm6, %xmm0
+; SSSE3-NEXT:    pxor %xmm6, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSSE3-NEXT:    pxor %xmm5, %xmm6
+; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSSE3-NEXT:    pxor %xmm5, %xmm6
+; SSSE3-NEXT:    pandn %xmm6, %xmm2
+; SSSE3-NEXT:    pxor %xmm6, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT:    pxor %xmm5, %xmm6
+; SSSE3-NEXT:    pxor %xmm7, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm7
+; SSSE3-NEXT:    pxor %xmm5, %xmm7
+; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSSE3-NEXT:    psubd %xmm4, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSSE3-NEXT:    pxor %xmm5, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm3
+; SSSE3-NEXT:    pxor %xmm5, %xmm3
+; SSSE3-NEXT:    pandn %xmm3, %xmm6
+; SSSE3-NEXT:    movq %xmm1, 16(%rcx)
+; SSSE3-NEXT:    movdqa %xmm0, (%rcx)
+; SSSE3-NEXT:    movq %xmm6, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm2, (%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: ssubo_v6i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movq %rdi, %rax
+; SSE41-NEXT:    movd %esi, %xmm0
+; SSE41-NEXT:    pinsrd $1, %edx, %xmm0
+; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
+; SSE41-NEXT:    pinsrd $3, %r8d, %xmm0
+; SSE41-NEXT:    movd %r9d, %xmm1
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm6
+; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm6
+; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm6
+; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSE41-NEXT:    pxor %xmm5, %xmm2
+; SSE41-NEXT:    pxor %xmm7, %xmm7
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT:    pxor %xmm5, %xmm7
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm2
+; SSE41-NEXT:    psubd %xmm6, %xmm0
+; SSE41-NEXT:    pxor %xmm6, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    pandn %xmm6, %xmm2
+; SSE41-NEXT:    pxor %xmm6, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    pxor %xmm7, %xmm7
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm7
+; SSE41-NEXT:    pxor %xmm5, %xmm7
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT:    psubd %xmm3, %xmm1
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE41-NEXT:    pxor %xmm5, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm4
+; SSE41-NEXT:    pxor %xmm5, %xmm4
+; SSE41-NEXT:    pandn %xmm4, %xmm6
+; SSE41-NEXT:    movq %xmm1, 16(%rcx)
+; SSE41-NEXT:    movdqa %xmm0, (%rcx)
+; SSE41-NEXT:    movq %xmm6, 16(%rdi)
+; SSE41-NEXT:    movdqa %xmm2, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: ssubo_v6i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm9
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm9, %xmm3, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm3, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm7, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm8
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm2
+; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm4, %ymm8
+; AVX1-NEXT:    vpsubd %xmm9, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm3, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm7, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm0
+; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vandps %ymm0, %ymm8, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vmovq %xmm6, 16(%rdi)
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: ssubo_v6i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm3
+; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX2-NEXT:    vpxor %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm5
+; AVX2-NEXT:    vpxor %ymm4, %ymm5, %ymm5
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vmovq %xmm2, 16(%rdi)
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: ssubo_v6i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k0
+; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm0, %k1
+; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kandw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT:    vmovq %xmm2, 16(%rdi)
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<6 x i32>, <6 x i1>} @llvm.ssub.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
+  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
+  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
+  %res = sext <6 x i1> %obit to <6 x i32>
+  store <6 x i32> %val, <6 x i32>* %p2
+  ret <6 x i32> %res
+}
+
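+; <8 x i32> is split into two <4 x i32> halves on SSE and AVX1; AVX2 and AVX512
+; keep it as a single 256-bit operation.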
+define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
+; SSE-LABEL: ssubo_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm5, %xmm5
+; SSE-NEXT:    pxor %xmm4, %xmm4
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE-NEXT:    pcmpeqd %xmm6, %xmm6
+; SSE-NEXT:    pxor %xmm6, %xmm4
+; SSE-NEXT:    pxor %xmm7, %xmm7
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm7
+; SSE-NEXT:    pxor %xmm6, %xmm7
+; SSE-NEXT:    pcmpeqd %xmm7, %xmm4
+; SSE-NEXT:    psubd %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pcmpeqd %xmm7, %xmm2
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pandn %xmm2, %xmm4
+; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm7
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm7
+; SSE-NEXT:    pxor %xmm6, %xmm7
+; SSE-NEXT:    pcmpeqd %xmm7, %xmm2
+; SSE-NEXT:    psubd %xmm3, %xmm1
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE-NEXT:    pxor %xmm6, %xmm5
+; SSE-NEXT:    pcmpeqd %xmm7, %xmm5
+; SSE-NEXT:    pxor %xmm6, %xmm5
+; SSE-NEXT:    pandn %xmm5, %xmm2
+; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSE-NEXT:    movdqa %xmm0, (%rdi)
+; SSE-NEXT:    movdqa %xmm4, %xmm0
+; SSE-NEXT:    movdqa %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: ssubo_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm9
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm9, %xmm3, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm3, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm7, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm8
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm2
+; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm4, %ymm8
+; AVX1-NEXT:    vpsubd %xmm9, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm3, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm7, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm1
+; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vandps %ymm1, %ymm8, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm2
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vmovaps %ymm2, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: ssubo_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm3
+; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX2-NEXT:    vpxor %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm5
+; AVX2-NEXT:    vpxor %ymm4, %ymm5, %ymm5
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa %ymm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: ssubo_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k0
+; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm0, %k1
+; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kandw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<8 x i32>, <8 x i1>} @llvm.ssub.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
+  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
+  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
+  %res = sext <8 x i1> %obit to <8 x i32>
+  store <8 x i32> %val, <8 x i32>* %p2
+  ret <8 x i32> %res
+}
+
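+; <16 x i32> is split into four <4 x i32> pieces on SSE/AVX1, two <8 x i32> halves
+; on AVX2, and a single 512-bit operation on AVX512.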
+define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind {
+; SSE-LABEL: ssubo_v16i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm10, %xmm10
+; SSE-NEXT:    pxor %xmm8, %xmm8
+; SSE-NEXT:    pcmpgtd %xmm4, %xmm8
+; SSE-NEXT:    pcmpeqd %xmm11, %xmm11
+; SSE-NEXT:    pxor %xmm11, %xmm8
+; SSE-NEXT:    pxor %xmm9, %xmm9
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm9
+; SSE-NEXT:    pxor %xmm11, %xmm9
+; SSE-NEXT:    pcmpeqd %xmm9, %xmm8
+; SSE-NEXT:    psubd %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm4, %xmm4
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE-NEXT:    pxor %xmm11, %xmm4
+; SSE-NEXT:    pcmpeqd %xmm9, %xmm4
+; SSE-NEXT:    pxor %xmm11, %xmm4
+; SSE-NEXT:    pandn %xmm4, %xmm8
+; SSE-NEXT:    pxor %xmm9, %xmm9
+; SSE-NEXT:    pcmpgtd %xmm5, %xmm9
+; SSE-NEXT:    pxor %xmm11, %xmm9
+; SSE-NEXT:    pxor %xmm4, %xmm4
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE-NEXT:    pxor %xmm11, %xmm4
+; SSE-NEXT:    pcmpeqd %xmm4, %xmm9
+; SSE-NEXT:    psubd %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm5, %xmm5
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE-NEXT:    pxor %xmm11, %xmm5
+; SSE-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE-NEXT:    pxor %xmm11, %xmm5
+; SSE-NEXT:    pandn %xmm5, %xmm9
+; SSE-NEXT:    pxor %xmm4, %xmm4
+; SSE-NEXT:    pcmpgtd %xmm6, %xmm4
+; SSE-NEXT:    pxor %xmm11, %xmm4
+; SSE-NEXT:    pxor %xmm5, %xmm5
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm11, %xmm5
+; SSE-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSE-NEXT:    psubd %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm6, %xmm6
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE-NEXT:    pxor %xmm11, %xmm6
+; SSE-NEXT:    pcmpeqd %xmm5, %xmm6
+; SSE-NEXT:    pxor %xmm11, %xmm6
+; SSE-NEXT:    pandn %xmm6, %xmm4
+; SSE-NEXT:    pxor %xmm5, %xmm5
+; SSE-NEXT:    pcmpgtd %xmm7, %xmm5
+; SSE-NEXT:    pxor %xmm11, %xmm5
+; SSE-NEXT:    pxor %xmm6, %xmm6
+; SSE-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE-NEXT:    pxor %xmm11, %xmm6
+; SSE-NEXT:    pcmpeqd %xmm6, %xmm5
+; SSE-NEXT:    psubd %xmm7, %xmm3
+; SSE-NEXT:    pcmpgtd %xmm3, %xmm10
+; SSE-NEXT:    pxor %xmm11, %xmm10
+; SSE-NEXT:    pcmpeqd %xmm6, %xmm10
+; SSE-NEXT:    pxor %xmm11, %xmm10
+; SSE-NEXT:    pandn %xmm10, %xmm5
+; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
+; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
+; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSE-NEXT:    movdqa %xmm0, (%rdi)
+; SSE-NEXT:    movdqa %xmm8, %xmm0
+; SSE-NEXT:    movdqa %xmm9, %xmm1
+; SSE-NEXT:    movdqa %xmm4, %xmm2
+; SSE-NEXT:    movdqa %xmm5, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: ssubo_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm8
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpgtd %xmm8, %xmm5, %xmm7
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm9
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm12
+; AVX1-NEXT:    vpcmpgtd %xmm12, %xmm5, %xmm7
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm10
+; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm10, %xmm7
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm9
+; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm5, %xmm7
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm11
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm5, %xmm7
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7
+; AVX1-NEXT:    vpcmpeqd %xmm11, %xmm7, %xmm6
+; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm6, %ymm9
+; AVX1-NEXT:    vpsubd %xmm8, %xmm12, %xmm8
+; AVX1-NEXT:    vpcmpgtd %xmm8, %xmm5, %xmm6
+; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm10, %xmm6
+; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6
+; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm10
+; AVX1-NEXT:    vpcmpgtd %xmm10, %xmm5, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm7, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm3, %ymm3
+; AVX1-NEXT:    vandps %ymm3, %ymm9, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT:    vpackssdw %xmm6, %xmm3, %xmm9
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm13
+; AVX1-NEXT:    vpcmpgtd %xmm13, %xmm5, %xmm7
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm5, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm3, %xmm7
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm11
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm5, %xmm7
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm12
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm5, %xmm7
+; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7
+; AVX1-NEXT:    vpcmpeqd %xmm12, %xmm7, %xmm6
+; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm6, %ymm11
+; AVX1-NEXT:    vpsubd %xmm13, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm5, %xmm6
+; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm5, %xmm2
+; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm7, %xmm2
+; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vandps %ymm2, %ymm11, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm10, %ymm4
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpmovsxwd %xmm9, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm9[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vmovaps %ymm4, 32(%rdi)
+; AVX1-NEXT:    vmovaps %ymm3, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: ssubo_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm4, %ymm5
+; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm6, %ymm6
+; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm4, %ymm7
+; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm7
+; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm7, %ymm5
+; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm3
+; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm4, %ymm1
+; AVX2-NEXT:    vpxor %ymm6, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm7, %ymm1
+; AVX2-NEXT:    vpxor %ymm6, %ymm1, %ymm1
+; AVX2-NEXT:    vpandn %ymm1, %ymm5, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm4, %ymm5
+; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm4, %ymm7
+; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm7
+; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm7, %ymm5
+; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm4, %ymm0
+; AVX2-NEXT:    vpxor %ymm6, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm7, %ymm0
+; AVX2-NEXT:    vpxor %ymm6, %ymm0, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: ssubo_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm1, %k0
+; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm0, %k1
+; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm1, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kandw %k1, %k0, %k1
+; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
+  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
+  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
+  %res = sext <16 x i1> %obit to <16 x i32>
+  store <16 x i32> %val, <16 x i32>* %p2
+  ret <16 x i32> %res
+}
+
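+; The i8 subtract itself stays 128-bit, but sign-extending the <16 x i1> overflow
+; mask to <16 x i32> takes four xmm registers on SSE, two ymm registers on
+; AVX1/AVX2, and a single masked zmm on AVX512.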
+define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind {
+; SSE2-LABEL: ssubo_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    pcmpeqb %xmm5, %xmm3
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    pcmpeqb %xmm5, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    pandn %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT:    pslld $31, %xmm4
+; SSE2-NEXT:    psrad $31, %xmm4
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    pslld $31, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    pslld $31, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    pslld $31, %xmm3
+; SSE2-NEXT:    psrad $31, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: ssubo_v16i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpgtb %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pcmpgtb %xmm0, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    pcmpeqb %xmm5, %xmm3
+; SSSE3-NEXT:    psubb %xmm1, %xmm0
+; SSSE3-NEXT:    pcmpgtb %xmm0, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm5, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    pandn %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm3, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSSE3-NEXT:    pslld $31, %xmm4
+; SSSE3-NEXT:    psrad $31, %xmm4
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    pslld $31, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    pslld $31, %xmm2
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSSE3-NEXT:    pslld $31, %xmm3
+; SSSE3-NEXT:    psrad $31, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: ssubo_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pcmpgtb %xmm1, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT:    pxor %xmm4, %xmm3
+; SSE41-NEXT:    pxor %xmm5, %xmm5
+; SSE41-NEXT:    pcmpgtb %xmm0, %xmm5
+; SSE41-NEXT:    pxor %xmm4, %xmm5
+; SSE41-NEXT:    pcmpeqb %xmm5, %xmm3
+; SSE41-NEXT:    psubb %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtb %xmm0, %xmm2
+; SSE41-NEXT:    pxor %xmm4, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm5, %xmm2
+; SSE41-NEXT:    pxor %xmm4, %xmm2
+; SSE41-NEXT:    pandn %xmm2, %xmm3
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm4
+; SSE41-NEXT:    psrad $31, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm3
+; SSE41-NEXT:    psrad $31, %xmm3
+; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: ssubo_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm6
+; AVX1-NEXT:    vpcmpgtb %xmm6, %xmm2, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vmovdqa %xmm6, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: ssubo_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm5
+; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT:    vpcmpeqb %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm6
+; AVX2-NEXT:    vpcmpgtb %xmm6, %xmm2, %xmm0
+; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqb %xmm0, %xmm5, %xmm0
+; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm3, %xmm1
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrad $31, %ymm1, %ymm1
+; AVX2-NEXT:    vmovdqa %xmm6, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: ssubo_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltb %xmm2, %xmm1, %k0
+; AVX512-NEXT:    vpcmpnltb %xmm2, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpnltb %xmm2, %xmm1, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kandw %k1, %k0, %k1
+; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
+  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
+  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
+  %res = sext <16 x i1> %obit to <16 x i32>
+  store <16 x i8> %val, <16 x i8>* %p2
+  ret <16 x i32> %res
+}
+
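+; Same pattern for <8 x i16>: the overflow mask is sign-extended to <8 x i32>
+; (two xmm registers on SSE, one ymm on AVX).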
+define <8 x i32> @ssubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind {
+; SSE2-LABEL: ssubo_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtw %xmm2, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtw %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    pcmpeqw %xmm5, %xmm1
+; SSE2-NEXT:    psubw %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtw %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pcmpeqw %xmm5, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pandn %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    pslld $31, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    pslld $31, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: ssubo_v8i16:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpgtw %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pcmpgtw %xmm0, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    pcmpeqw %xmm5, %xmm1
+; SSSE3-NEXT:    psubw %xmm2, %xmm0
+; SSSE3-NEXT:    pcmpgtw %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSSE3-NEXT:    pcmpeqw %xmm5, %xmm3
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSSE3-NEXT:    pandn %xmm3, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    pslld $31, %xmm2
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    pslld $31, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: ssubo_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pcmpgtw %xmm2, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT:    pxor %xmm4, %xmm1
+; SSE41-NEXT:    pxor %xmm5, %xmm5
+; SSE41-NEXT:    pcmpgtw %xmm0, %xmm5
+; SSE41-NEXT:    pxor %xmm4, %xmm5
+; SSE41-NEXT:    pcmpeqw %xmm5, %xmm1
+; SSE41-NEXT:    psubw %xmm2, %xmm0
+; SSE41-NEXT:    pcmpgtw %xmm0, %xmm3
+; SSE41-NEXT:    pxor %xmm4, %xmm3
+; SSE41-NEXT:    pcmpeqw %xmm5, %xmm3
+; SSE41-NEXT:    pxor %xmm4, %xmm3
+; SSE41-NEXT:    pandn %xmm3, %xmm1
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT:    pslld $31, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT:    pslld $31, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: ssubo_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqw %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: ssubo_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm5
+; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT:    vpcmpeqw %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm0
+; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqw %xmm0, %xmm5, %xmm0
+; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: ssubo_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltw %xmm2, %xmm1, %k0
+; AVX512-NEXT:    vpcmpnltw %xmm2, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpnltw %xmm2, %xmm1, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kandw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<8 x i16>, <8 x i1>} @llvm.ssub.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
+  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
+  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
+  %res = sext <8 x i1> %obit to <8 x i32>
+  store <8 x i16> %val, <8 x i16>* %p2
+  ret <8 x i32> %res
+}
+
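+; There is no native 64-bit signed compare before SSE4.2, so the 64-bit sign tests
+; are emulated with 32-bit pcmpgtd/pcmpeqd and the 0x80000000 bias constant, which
+; makes this lowering much longer than the 32-bit cases.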
+define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
+; SSE2-LABEL: ssubo_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psubq %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm5, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm6
+; SSE2-NEXT:    pxor %xmm5, %xmm6
+; SSE2-NEXT:    pcmpeqd %xmm6, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm5, %xmm0
+; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: ssubo_v2i64:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    psubq %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm5, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm4
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSSE3-NEXT:    pxor %xmm5, %xmm4
+; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm6, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    por %xmm3, %xmm6
+; SSSE3-NEXT:    pxor %xmm5, %xmm6
+; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2]
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm0, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm2
+; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pxor %xmm5, %xmm0
+; SSSE3-NEXT:    pandn %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: ssubo_v2i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psubq %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pand %xmm5, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE41-NEXT:    por %xmm6, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT:    pxor %xmm4, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm2, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE41-NEXT:    por %xmm3, %xmm5
+; SSE41-NEXT:    pxor %xmm4, %xmm5
+; SSE41-NEXT:    pcmpeqq %xmm5, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE41-NEXT:    por %xmm0, %xmm2
+; SSE41-NEXT:    pxor %xmm4, %xmm2
+; SSE41-NEXT:    pcmpeqq %xmm5, %xmm2
+; SSE41-NEXT:    pxor %xmm4, %xmm2
+; SSE41-NEXT:    pandn %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: ssubo_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: ssubo_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
+; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT:    vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm0
+; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqq %xmm0, %xmm5, %xmm0
+; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: ssubo_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm1, %k0
+; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm0, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kandw %k1, %k0, %k1
+; AVX512-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    retq
+  %t = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
+  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i64> %val, <2 x i64>* %p2
+  ret <2 x i32> %res
+}
+
+define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
+; SSE2-LABEL: ssubo_v4i24:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pslld $8, %xmm1
+; SSE2-NEXT:    psrad $8, %xmm1
+; SSE2-NEXT:    pslld $8, %xmm2
+; SSE2-NEXT:    psrad $8, %xmm2
+; SSE2-NEXT:    psubd %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pslld $8, %xmm0
+; SSE2-NEXT:    psrad $8, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    movw %ax, (%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; SSE2-NEXT:    movd %xmm1, %ecx
+; SSE2-NEXT:    movw %cx, 9(%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movd %xmm1, %edx
+; SSE2-NEXT:    movw %dx, 6(%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSE2-NEXT:    movd %xmm1, %esi
+; SSE2-NEXT:    movw %si, 3(%rdi)
+; SSE2-NEXT:    shrl $16, %eax
+; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    shrl $16, %ecx
+; SSE2-NEXT:    movb %cl, 11(%rdi)
+; SSE2-NEXT:    shrl $16, %edx
+; SSE2-NEXT:    movb %dl, 8(%rdi)
+; SSE2-NEXT:    shrl $16, %esi
+; SSE2-NEXT:    movb %sil, 5(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: ssubo_v4i24:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pslld $8, %xmm1
+; SSSE3-NEXT:    psrad $8, %xmm1
+; SSSE3-NEXT:    pslld $8, %xmm2
+; SSSE3-NEXT:    psrad $8, %xmm2
+; SSSE3-NEXT:    psubd %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    pslld $8, %xmm0
+; SSSE3-NEXT:    psrad $8, %xmm0
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    pxor %xmm1, %xmm0
+; SSSE3-NEXT:    movd %xmm2, %eax
+; SSSE3-NEXT:    movw %ax, (%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm1, %ecx
+; SSSE3-NEXT:    movw %cx, 9(%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm1, %edx
+; SSSE3-NEXT:    movw %dx, 6(%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm1, %esi
+; SSSE3-NEXT:    movw %si, 3(%rdi)
+; SSSE3-NEXT:    shrl $16, %eax
+; SSSE3-NEXT:    movb %al, 2(%rdi)
+; SSSE3-NEXT:    shrl $16, %ecx
+; SSSE3-NEXT:    movb %cl, 11(%rdi)
+; SSSE3-NEXT:    shrl $16, %edx
+; SSSE3-NEXT:    movb %dl, 8(%rdi)
+; SSSE3-NEXT:    shrl $16, %esi
+; SSSE3-NEXT:    movb %sil, 5(%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: ssubo_v4i24:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pslld $8, %xmm1
+; SSE41-NEXT:    psrad $8, %xmm1
+; SSE41-NEXT:    pslld $8, %xmm2
+; SSE41-NEXT:    psrad $8, %xmm2
+; SSE41-NEXT:    psubd %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pslld $8, %xmm0
+; SSE41-NEXT:    psrad $8, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrd $3, %xmm2, %eax
+; SSE41-NEXT:    movw %ax, 9(%rdi)
+; SSE41-NEXT:    pextrd $2, %xmm2, %ecx
+; SSE41-NEXT:    movw %cx, 6(%rdi)
+; SSE41-NEXT:    pextrd $1, %xmm2, %edx
+; SSE41-NEXT:    movw %dx, 3(%rdi)
+; SSE41-NEXT:    movd %xmm2, %esi
+; SSE41-NEXT:    movw %si, (%rdi)
+; SSE41-NEXT:    shrl $16, %eax
+; SSE41-NEXT:    movb %al, 11(%rdi)
+; SSE41-NEXT:    shrl $16, %ecx
+; SSE41-NEXT:    movb %cl, 8(%rdi)
+; SSE41-NEXT:    shrl $16, %edx
+; SSE41-NEXT:    movb %dl, 5(%rdi)
+; SSE41-NEXT:    shrl $16, %esi
+; SSE41-NEXT:    movb %sil, 2(%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: ssubo_v4i24:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpslld $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpslld $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpslld $8, %xmm1, %xmm0
+; AVX1-NEXT:    vpsrad $8, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrd $3, %xmm1, %eax
+; AVX1-NEXT:    movw %ax, 9(%rdi)
+; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX1-NEXT:    movw %cx, 6(%rdi)
+; AVX1-NEXT:    vpextrd $1, %xmm1, %edx
+; AVX1-NEXT:    movw %dx, 3(%rdi)
+; AVX1-NEXT:    vmovd %xmm1, %esi
+; AVX1-NEXT:    movw %si, (%rdi)
+; AVX1-NEXT:    shrl $16, %eax
+; AVX1-NEXT:    movb %al, 11(%rdi)
+; AVX1-NEXT:    shrl $16, %ecx
+; AVX1-NEXT:    movb %cl, 8(%rdi)
+; AVX1-NEXT:    shrl $16, %edx
+; AVX1-NEXT:    movb %dl, 5(%rdi)
+; AVX1-NEXT:    shrl $16, %esi
+; AVX1-NEXT:    movb %sil, 2(%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: ssubo_v4i24:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpslld $8, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrad $8, %xmm1, %xmm1
+; AVX2-NEXT:    vpslld $8, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $8, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpslld $8, %xmm1, %xmm0
+; AVX2-NEXT:    vpsrad $8, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrd $3, %xmm1, %eax
+; AVX2-NEXT:    movw %ax, 9(%rdi)
+; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX2-NEXT:    movw %cx, 6(%rdi)
+; AVX2-NEXT:    vpextrd $1, %xmm1, %edx
+; AVX2-NEXT:    movw %dx, 3(%rdi)
+; AVX2-NEXT:    vmovd %xmm1, %esi
+; AVX2-NEXT:    movw %si, (%rdi)
+; AVX2-NEXT:    shrl $16, %eax
+; AVX2-NEXT:    movb %al, 11(%rdi)
+; AVX2-NEXT:    shrl $16, %ecx
+; AVX2-NEXT:    movb %cl, 8(%rdi)
+; AVX2-NEXT:    shrl $16, %edx
+; AVX2-NEXT:    movb %dl, 5(%rdi)
+; AVX2-NEXT:    shrl $16, %esi
+; AVX2-NEXT:    movb %sil, 2(%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: ssubo_v4i24:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $8, %xmm1, %xmm1
+; AVX512-NEXT:    vpsrad $8, %xmm1, %xmm1
+; AVX512-NEXT:    vpslld $8, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
+; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpslld $8, %xmm1, %xmm0
+; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
+; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
+; AVX512-NEXT:    movw %ax, 9(%rdi)
+; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX512-NEXT:    movw %cx, 6(%rdi)
+; AVX512-NEXT:    vpextrd $1, %xmm1, %edx
+; AVX512-NEXT:    movw %dx, 3(%rdi)
+; AVX512-NEXT:    vmovd %xmm1, %esi
+; AVX512-NEXT:    movw %si, (%rdi)
+; AVX512-NEXT:    shrl $16, %eax
+; AVX512-NEXT:    movb %al, 11(%rdi)
+; AVX512-NEXT:    shrl $16, %ecx
+; AVX512-NEXT:    movb %cl, 8(%rdi)
+; AVX512-NEXT:    shrl $16, %edx
+; AVX512-NEXT:    movb %dl, 5(%rdi)
+; AVX512-NEXT:    shrl $16, %esi
+; AVX512-NEXT:    movb %sil, 2(%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<4 x i24>, <4 x i1>} @llvm.ssub.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
+  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i24> %val, <4 x i24>* %p2
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
+; SSE-LABEL: ssubo_v4i1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pslld $31, %xmm1
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    pslld $31, %xmm0
+; SSE-NEXT:    psrad $31, %xmm0
+; SSE-NEXT:    psubd %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    pslld $31, %xmm1
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    movmskps %xmm1, %eax
+; SSE-NEXT:    movb %al, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: ssubo_v4i1:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovmskps %xmm1, %eax
+; AVX1-NEXT:    movb %al, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: ssubo_v4i1:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpslld $31, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vmovmskps %xmm1, %eax
+; AVX2-NEXT:    movb %al, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: ssubo_v4i1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
+; AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vpslld $31, %xmm1, %xmm0
+; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k3
+; AVX512-NEXT:    kxorw %k2, %k0, %k0
+; AVX512-NEXT:    kxnorw %k0, %k1, %k1
+; AVX512-NEXT:    kandnw %k1, %k3, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    movb %al, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<4 x i1>, <4 x i1>} @llvm.ssub.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
+  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i1> %val, <4 x i1>* %p2
+  ret <4 x i32> %res
+}
+
+define <2 x i32> @ssubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
+; SSE2-LABEL: ssubo_v2i128:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSE2-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT:    movq %rcx, %rax
+; SSE2-NEXT:    sbbq %r11, %rax
+; SSE2-NEXT:    setns %bl
+; SSE2-NEXT:    testq %rcx, %rcx
+; SSE2-NEXT:    setns %cl
+; SSE2-NEXT:    cmpb %bl, %cl
+; SSE2-NEXT:    setne %bpl
+; SSE2-NEXT:    testq %r11, %r11
+; SSE2-NEXT:    setns %bl
+; SSE2-NEXT:    cmpb %bl, %cl
+; SSE2-NEXT:    setne %cl
+; SSE2-NEXT:    andb %bpl, %cl
+; SSE2-NEXT:    movzbl %cl, %ebp
+; SSE2-NEXT:    testq %r9, %r9
+; SSE2-NEXT:    setns %bl
+; SSE2-NEXT:    testq %rsi, %rsi
+; SSE2-NEXT:    setns %cl
+; SSE2-NEXT:    cmpb %bl, %cl
+; SSE2-NEXT:    setne %r11b
+; SSE2-NEXT:    subq %r8, %rdi
+; SSE2-NEXT:    sbbq %r9, %rsi
+; SSE2-NEXT:    setns %bl
+; SSE2-NEXT:    cmpb %bl, %cl
+; SSE2-NEXT:    setne %cl
+; SSE2-NEXT:    andb %r11b, %cl
+; SSE2-NEXT:    movzbl %cl, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    pinsrw $4, %ebp, %xmm0
+; SSE2-NEXT:    movq %rdx, 16(%r10)
+; SSE2-NEXT:    movq %rdi, (%r10)
+; SSE2-NEXT:    movq %rax, 24(%r10)
+; SSE2-NEXT:    movq %rsi, 8(%r10)
+; SSE2-NEXT:    psllq $63, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: ssubo_v2i128:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pushq %rbp
+; SSSE3-NEXT:    pushq %rbx
+; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSSE3-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
+; SSSE3-NEXT:    movq %rcx, %rax
+; SSSE3-NEXT:    sbbq %r11, %rax
+; SSSE3-NEXT:    setns %bl
+; SSSE3-NEXT:    testq %rcx, %rcx
+; SSSE3-NEXT:    setns %cl
+; SSSE3-NEXT:    cmpb %bl, %cl
+; SSSE3-NEXT:    setne %bpl
+; SSSE3-NEXT:    testq %r11, %r11
+; SSSE3-NEXT:    setns %bl
+; SSSE3-NEXT:    cmpb %bl, %cl
+; SSSE3-NEXT:    setne %cl
+; SSSE3-NEXT:    andb %bpl, %cl
+; SSSE3-NEXT:    movzbl %cl, %ebp
+; SSSE3-NEXT:    testq %r9, %r9
+; SSSE3-NEXT:    setns %bl
+; SSSE3-NEXT:    testq %rsi, %rsi
+; SSSE3-NEXT:    setns %cl
+; SSSE3-NEXT:    cmpb %bl, %cl
+; SSSE3-NEXT:    setne %r11b
+; SSSE3-NEXT:    subq %r8, %rdi
+; SSSE3-NEXT:    sbbq %r9, %rsi
+; SSSE3-NEXT:    setns %bl
+; SSSE3-NEXT:    cmpb %bl, %cl
+; SSSE3-NEXT:    setne %cl
+; SSSE3-NEXT:    andb %r11b, %cl
+; SSSE3-NEXT:    movzbl %cl, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    pinsrw $4, %ebp, %xmm0
+; SSSE3-NEXT:    movq %rdx, 16(%r10)
+; SSSE3-NEXT:    movq %rdi, (%r10)
+; SSSE3-NEXT:    movq %rax, 24(%r10)
+; SSSE3-NEXT:    movq %rsi, 8(%r10)
+; SSSE3-NEXT:    psllq $63, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    popq %rbx
+; SSSE3-NEXT:    popq %rbp
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: ssubo_v2i128:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pushq %rbp
+; SSE41-NEXT:    pushq %rbx
+; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSE41-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
+; SSE41-NEXT:    movq %rcx, %rax
+; SSE41-NEXT:    sbbq %r11, %rax
+; SSE41-NEXT:    setns %bl
+; SSE41-NEXT:    testq %rcx, %rcx
+; SSE41-NEXT:    setns %cl
+; SSE41-NEXT:    cmpb %bl, %cl
+; SSE41-NEXT:    setne %bpl
+; SSE41-NEXT:    testq %r11, %r11
+; SSE41-NEXT:    setns %bl
+; SSE41-NEXT:    cmpb %bl, %cl
+; SSE41-NEXT:    setne %cl
+; SSE41-NEXT:    andb %bpl, %cl
+; SSE41-NEXT:    movzbl %cl, %ebp
+; SSE41-NEXT:    testq %r9, %r9
+; SSE41-NEXT:    setns %bl
+; SSE41-NEXT:    testq %rsi, %rsi
+; SSE41-NEXT:    setns %cl
+; SSE41-NEXT:    cmpb %bl, %cl
+; SSE41-NEXT:    setne %r11b
+; SSE41-NEXT:    subq %r8, %rdi
+; SSE41-NEXT:    sbbq %r9, %rsi
+; SSE41-NEXT:    setns %bl
+; SSE41-NEXT:    cmpb %bl, %cl
+; SSE41-NEXT:    setne %cl
+; SSE41-NEXT:    andb %r11b, %cl
+; SSE41-NEXT:    movzbl %cl, %ecx
+; SSE41-NEXT:    movd %ecx, %xmm0
+; SSE41-NEXT:    pinsrb $8, %ebp, %xmm0
+; SSE41-NEXT:    movq %rdx, 16(%r10)
+; SSE41-NEXT:    movq %rdi, (%r10)
+; SSE41-NEXT:    movq %rax, 24(%r10)
+; SSE41-NEXT:    movq %rsi, 8(%r10)
+; SSE41-NEXT:    psllq $63, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    popq %rbx
+; SSE41-NEXT:    popq %rbp
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: ssubo_v2i128:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX1-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
+; AVX1-NEXT:    movq %rcx, %rax
+; AVX1-NEXT:    sbbq %r11, %rax
+; AVX1-NEXT:    setns %bl
+; AVX1-NEXT:    testq %rcx, %rcx
+; AVX1-NEXT:    setns %cl
+; AVX1-NEXT:    cmpb %bl, %cl
+; AVX1-NEXT:    setne %bpl
+; AVX1-NEXT:    testq %r11, %r11
+; AVX1-NEXT:    setns %bl
+; AVX1-NEXT:    cmpb %bl, %cl
+; AVX1-NEXT:    setne %cl
+; AVX1-NEXT:    andb %bpl, %cl
+; AVX1-NEXT:    movzbl %cl, %ebp
+; AVX1-NEXT:    testq %r9, %r9
+; AVX1-NEXT:    setns %bl
+; AVX1-NEXT:    testq %rsi, %rsi
+; AVX1-NEXT:    setns %cl
+; AVX1-NEXT:    cmpb %bl, %cl
+; AVX1-NEXT:    setne %r11b
+; AVX1-NEXT:    subq %r8, %rdi
+; AVX1-NEXT:    sbbq %r9, %rsi
+; AVX1-NEXT:    setns %bl
+; AVX1-NEXT:    cmpb %bl, %cl
+; AVX1-NEXT:    setne %cl
+; AVX1-NEXT:    andb %r11b, %cl
+; AVX1-NEXT:    movzbl %cl, %ecx
+; AVX1-NEXT:    vmovd %ecx, %xmm0
+; AVX1-NEXT:    vpinsrb $8, %ebp, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rdx, 16(%r10)
+; AVX1-NEXT:    movq %rdi, (%r10)
+; AVX1-NEXT:    movq %rax, 24(%r10)
+; AVX1-NEXT:    movq %rsi, 8(%r10)
+; AVX1-NEXT:    vpsllq $63, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: ssubo_v2i128:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT:    movq %rcx, %rax
+; AVX2-NEXT:    sbbq %r11, %rax
+; AVX2-NEXT:    setns %bl
+; AVX2-NEXT:    testq %rcx, %rcx
+; AVX2-NEXT:    setns %cl
+; AVX2-NEXT:    cmpb %bl, %cl
+; AVX2-NEXT:    setne %bpl
+; AVX2-NEXT:    testq %r11, %r11
+; AVX2-NEXT:    setns %bl
+; AVX2-NEXT:    cmpb %bl, %cl
+; AVX2-NEXT:    setne %cl
+; AVX2-NEXT:    andb %bpl, %cl
+; AVX2-NEXT:    movzbl %cl, %ebp
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    setns %bl
+; AVX2-NEXT:    testq %rsi, %rsi
+; AVX2-NEXT:    setns %cl
+; AVX2-NEXT:    cmpb %bl, %cl
+; AVX2-NEXT:    setne %r11b
+; AVX2-NEXT:    subq %r8, %rdi
+; AVX2-NEXT:    sbbq %r9, %rsi
+; AVX2-NEXT:    setns %bl
+; AVX2-NEXT:    cmpb %bl, %cl
+; AVX2-NEXT:    setne %cl
+; AVX2-NEXT:    andb %r11b, %cl
+; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    vmovd %ecx, %xmm0
+; AVX2-NEXT:    vpinsrb $8, %ebp, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rdx, 16(%r10)
+; AVX2-NEXT:    movq %rdi, (%r10)
+; AVX2-NEXT:    movq %rax, 24(%r10)
+; AVX2-NEXT:    movq %rsi, 8(%r10)
+; AVX2-NEXT:    vpsllq $63, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: ssubo_v2i128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT:    movq %rcx, %r14
+; AVX512-NEXT:    sbbq %r11, %r14
+; AVX512-NEXT:    setns %bl
+; AVX512-NEXT:    testq %rcx, %rcx
+; AVX512-NEXT:    setns %cl
+; AVX512-NEXT:    cmpb %bl, %cl
+; AVX512-NEXT:    setne %bl
+; AVX512-NEXT:    testq %r11, %r11
+; AVX512-NEXT:    setns %al
+; AVX512-NEXT:    cmpb %al, %cl
+; AVX512-NEXT:    setne %al
+; AVX512-NEXT:    andb %bl, %al
+; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    testq %r9, %r9
+; AVX512-NEXT:    setns %al
+; AVX512-NEXT:    testq %rsi, %rsi
+; AVX512-NEXT:    setns %cl
+; AVX512-NEXT:    cmpb %al, %cl
+; AVX512-NEXT:    setne %al
+; AVX512-NEXT:    subq %r8, %rdi
+; AVX512-NEXT:    sbbq %r9, %rsi
+; AVX512-NEXT:    setns %bl
+; AVX512-NEXT:    cmpb %bl, %cl
+; AVX512-NEXT:    setne %cl
+; AVX512-NEXT:    andb %al, %cl
+; AVX512-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1
+; AVX512-NEXT:    movq %rdx, 16(%r10)
+; AVX512-NEXT:    movq %rdi, (%r10)
+; AVX512-NEXT:    movq %r14, 24(%r10)
+; AVX512-NEXT:    movq %rsi, 8(%r10)
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    retq
+  %t = call {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
+  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i128> %val, <2 x i128>* %p2
+  ret <2 x i32> %res
+}
diff --git a/test/CodeGen/X86/vec_uaddo.ll b/test/CodeGen/X86/vec_uaddo.ll
new file mode 100644
index 0000000..b040cd9
--- /dev/null
+++ b/test/CodeGen/X86/vec_uaddo.ll
@@ -0,0 +1,1381 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
+
+declare {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32>, <1 x i32>)
+declare {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32>, <2 x i32>)
+declare {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32>, <3 x i32>)
+declare {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32>, <6 x i32>)
+declare {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+
+declare {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24>, <4 x i24>)
+declare {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)
+declare {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128>, <2 x i128>)
+
+define <1 x i32> @uaddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
+; SSE-LABEL: uaddo_v1i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addl %esi, %edi
+; SSE-NEXT:    sbbl %eax, %eax
+; SSE-NEXT:    movl %edi, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: uaddo_v1i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    addl %esi, %edi
+; AVX-NEXT:    sbbl %eax, %eax
+; AVX-NEXT:    movl %edi, (%rdx)
+; AVX-NEXT:    retq
+  %t = call {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
+  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
+  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
+  %res = sext <1 x i1> %obit to <1 x i32>
+  store <1 x i32> %val, <1 x i32>* %p2
+  ret <1 x i32> %res
+}
+
+define <2 x i32> @uaddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    paddq %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    movq %xmm0, (%rdi)
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: uaddo_v2i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSSE3-NEXT:    pand %xmm2, %xmm1
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    paddq %xmm1, %xmm0
+; SSSE3-NEXT:    pand %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    pxor %xmm3, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT:    movq %xmm0, (%rdi)
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: uaddo_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    paddq %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    pcmpeqq %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE41-NEXT:    movq %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uaddo_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT:    vmovq %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uaddo_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT:    vmovq %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: uaddo_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512-NEXT:    vpmovqd %xmm0, (%rdi)
+; AVX512-NEXT:    vpcmpeqq %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %t = call {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
+  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i32> %val, <2 x i32>* %p2
+  ret <2 x i32> %res
+}
+
+define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v3i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT:    movq %xmm1, (%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm1, 8(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: uaddo_v3i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    paddd %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT:    movq %xmm1, (%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm1, 8(%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: uaddo_v3i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    paddd %xmm0, %xmm1
+; SSE41-NEXT:    pmaxud %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    pextrd $2, %xmm1, 8(%rdi)
+; SSE41-NEXT:    movq %xmm1, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uaddo_v3i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
+; AVX1-NEXT:    vmovq %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uaddo_v3i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
+; AVX2-NEXT:    vmovq %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: uaddo_v3i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpltud %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
+; AVX512-NEXT:    vmovq %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
+  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
+  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
+  %res = sext <3 x i1> %obit to <3 x i32>
+  store <3 x i32> %val, <3 x i32>* %p2
+  ret <3 x i32> %res
+}
+
+define <4 x i32> @uaddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: uaddo_v4i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    paddd %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, (%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: uaddo_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    paddd %xmm0, %xmm1
+; SSE41-NEXT:    pmaxud %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm1, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uaddo_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uaddo_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: uaddo_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpltud %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
+  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i32> %val, <4 x i32>* %p2
+  ret <4 x i32> %res
+}
+
+define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v6i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movd %r8d, %xmm1
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    movd %edx, %xmm3
+; SSE2-NEXT:    movd %esi, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT:    movd %r9d, %xmm2
+; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT:    paddd %xmm2, %xmm3
+; SSE2-NEXT:    movq %xmm3, 16(%rcx)
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT:    movq %xmm2, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm1, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: uaddo_v6i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movq %rdi, %rax
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    movd %r8d, %xmm1
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT:    movd %edx, %xmm3
+; SSSE3-NEXT:    movd %esi, %xmm1
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSSE3-NEXT:    movd %r9d, %xmm2
+; SSSE3-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm0, (%rcx)
+; SSSE3-NEXT:    pxor %xmm4, %xmm0
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSSE3-NEXT:    paddd %xmm2, %xmm3
+; SSSE3-NEXT:    movq %xmm3, 16(%rcx)
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT:    movq %xmm2, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm1, (%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: uaddo_v6i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movq %rdi, %rax
+; SSE41-NEXT:    movd %esi, %xmm0
+; SSE41-NEXT:    pinsrd $1, %edx, %xmm0
+; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
+; SSE41-NEXT:    pinsrd $3, %r8d, %xmm0
+; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT:    movd %r9d, %xmm2
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
+; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT:    paddd %xmm0, %xmm3
+; SSE41-NEXT:    pmaxud %xmm3, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT:    pxor %xmm4, %xmm0
+; SSE41-NEXT:    paddd %xmm2, %xmm1
+; SSE41-NEXT:    pmaxud %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT:    pxor %xmm4, %xmm2
+; SSE41-NEXT:    movq %xmm1, 16(%rcx)
+; SSE41-NEXT:    movdqa %xmm3, (%rcx)
+; SSE41-NEXT:    movq %xmm2, 16(%rdi)
+; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uaddo_v6i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmaxud %xmm3, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vmovq %xmm2, 16(%rdi)
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uaddo_v6i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpmaxud %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vmovq %xmm2, 16(%rdi)
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: uaddo_v6i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpcmpltud %ymm0, %ymm1, %k1
+; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT:    vmovq %xmm2, 16(%rdi)
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
+  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
+  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
+  %res = sext <6 x i1> %obit to <6 x i32>
+  store <6 x i32> %val, <6 x i32>* %p2
+  ret <6 x i32> %res
+}
+
+define <8 x i32> @uaddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    paddd %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, (%rdi)
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT:    paddd %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, 16(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: uaddo_v8i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    paddd %xmm0, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa %xmm2, (%rdi)
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT:    paddd %xmm1, %xmm3
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    pxor %xmm3, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm3, 16(%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: uaddo_v8i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    paddd %xmm0, %xmm2
+; SSE41-NEXT:    pmaxud %xmm2, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT:    pxor %xmm4, %xmm0
+; SSE41-NEXT:    paddd %xmm1, %xmm3
+; SSE41-NEXT:    pmaxud %xmm3, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm1
+; SSE41-NEXT:    pxor %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm3, 16(%rdi)
+; SSE41-NEXT:    movdqa %xmm2, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uaddo_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmaxud %xmm3, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vmovaps %ymm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uaddo_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpmaxud %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa %ymm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: uaddo_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpcmpltud %ymm0, %ymm1, %k1
+; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
+  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
+  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
+  %res = sext <8 x i1> %obit to <8 x i32>
+  store <8 x i32> %val, <8 x i32>* %p2
+  ret <8 x i32> %res
+}
+
+define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v16i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    paddd %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm4, (%rdi)
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSE2-NEXT:    paddd %xmm1, %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm5, 16(%rdi)
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm1
+; SSE2-NEXT:    paddd %xmm2, %xmm6
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm6, 32(%rdi)
+; SSE2-NEXT:    pxor %xmm8, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSE2-NEXT:    paddd %xmm3, %xmm7
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    pxor %xmm7, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm7, 48(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: uaddo_v16i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    paddd %xmm0, %xmm4
+; SSSE3-NEXT:    pxor %xmm8, %xmm0
+; SSSE3-NEXT:    movdqa %xmm4, (%rdi)
+; SSSE3-NEXT:    pxor %xmm8, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSSE3-NEXT:    paddd %xmm1, %xmm5
+; SSSE3-NEXT:    pxor %xmm8, %xmm1
+; SSSE3-NEXT:    movdqa %xmm5, 16(%rdi)
+; SSSE3-NEXT:    pxor %xmm8, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm1
+; SSSE3-NEXT:    paddd %xmm2, %xmm6
+; SSSE3-NEXT:    pxor %xmm8, %xmm2
+; SSSE3-NEXT:    movdqa %xmm6, 32(%rdi)
+; SSSE3-NEXT:    pxor %xmm8, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSSE3-NEXT:    paddd %xmm3, %xmm7
+; SSSE3-NEXT:    pxor %xmm8, %xmm3
+; SSSE3-NEXT:    pxor %xmm7, %xmm8
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSSE3-NEXT:    movdqa %xmm7, 48(%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: uaddo_v16i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    paddd %xmm0, %xmm4
+; SSE41-NEXT:    pmaxud %xmm4, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm8, %xmm8
+; SSE41-NEXT:    pxor %xmm8, %xmm0
+; SSE41-NEXT:    paddd %xmm1, %xmm5
+; SSE41-NEXT:    pmaxud %xmm5, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm1
+; SSE41-NEXT:    pxor %xmm8, %xmm1
+; SSE41-NEXT:    paddd %xmm2, %xmm6
+; SSE41-NEXT:    pmaxud %xmm6, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm2
+; SSE41-NEXT:    pxor %xmm8, %xmm2
+; SSE41-NEXT:    paddd %xmm3, %xmm7
+; SSE41-NEXT:    pmaxud %xmm7, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm3
+; SSE41-NEXT:    pxor %xmm8, %xmm3
+; SSE41-NEXT:    movdqa %xmm7, 48(%rdi)
+; SSE41-NEXT:    movdqa %xmm6, 32(%rdi)
+; SSE41-NEXT:    movdqa %xmm5, 16(%rdi)
+; SSE41-NEXT:    movdqa %xmm4, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uaddo_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpmaxud %xmm5, %xmm4, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm4, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpmaxud %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpxor %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT:    vpaddd %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vpmaxud %xmm7, %xmm5, %xmm7
+; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm5, %xmm7
+; AVX1-NEXT:    vpxor %xmm6, %xmm7, %xmm7
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpmaxud %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT:    vmovaps %ymm3, 32(%rdi)
+; AVX1-NEXT:    vmovaps %ymm2, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uaddo_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm3
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX2-NEXT:    vpxor %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
+; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpmaxud %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: uaddo_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vpcmpltud %zmm0, %zmm1, %k1
+; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
+  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
+  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
+  %res = sext <16 x i1> %obit to <16 x i32>
+  store <16 x i32> %val, <16 x i32>* %p2
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    pslld $31, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT:    pslld $31, %xmm4
+; SSE2-NEXT:    psrad $31, %xmm4
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    pslld $31, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    pslld $31, %xmm3
+; SSE2-NEXT:    psrad $31, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, (%rdi)
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: uaddo_v16i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    paddb %xmm0, %xmm1
+; SSSE3-NEXT:    pmaxub %xmm1, %xmm0
+; SSSE3-NEXT:    pcmpeqb %xmm1, %xmm0
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSSE3-NEXT:    pxor %xmm0, %xmm3
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    pslld $31, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSSE3-NEXT:    pslld $31, %xmm4
+; SSSE3-NEXT:    psrad $31, %xmm4
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    pslld $31, %xmm2
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSSE3-NEXT:    pslld $31, %xmm3
+; SSSE3-NEXT:    psrad $31, %xmm3
+; SSSE3-NEXT:    movdqa %xmm1, (%rdi)
+; SSSE3-NEXT:    movdqa %xmm4, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: uaddo_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    pmaxub %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqb %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE41-NEXT:    pxor %xmm0, %xmm3
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm4
+; SSE41-NEXT:    psrad $31, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm3
+; SSE41-NEXT:    psrad $31, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, (%rdi)
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uaddo_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpmaxub %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpcmpeqb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT:    vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uaddo_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpmaxub %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpcmpeqb %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrad $31, %ymm1, %ymm1
+; AVX2-NEXT:    vmovdqa %xmm2, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: uaddo_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpltub %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
+  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
+  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
+  %res = sext <16 x i1> %obit to <16 x i32>
+  store <16 x i8> %val, <16 x i8>* %p2
+  ret <16 x i32> %res
+}
+
+define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    paddw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    pslld $31, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    pslld $31, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, (%rdi)
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: uaddo_v8i16:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSSE3-NEXT:    paddw %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm3, %xmm2
+; SSSE3-NEXT:    pxor %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtw %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    pslld $31, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT:    pslld $31, %xmm2
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, (%rdi)
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: uaddo_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    paddw %xmm0, %xmm1
+; SSE41-NEXT:    pmaxuw %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqw %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    pxor %xmm0, %xmm2
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SSE41-NEXT:    pslld $31, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE41-NEXT:    pslld $31, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    movdqa %xmm1, (%rdi)
+; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uaddo_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpmaxuw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uaddo_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpmaxuw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: uaddo_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpltuw %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
+  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
+  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
+  %res = sext <8 x i1> %obit to <8 x i32>
+  store <8 x i16> %val, <8 x i16>* %p2
+  ret <8 x i32> %res
+}
+
+define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
+; SSE-LABEL: uaddo_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE-NEXT:    paddq %xmm0, %xmm1
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    movdqa %xmm1, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: uaddo_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uaddo_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: uaddo_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpltuq %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    retq
+  %t = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
+  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i64> %val, <2 x i64>* %p2
+  ret <2 x i32> %res
+}
+
+define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v4i24:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    paddd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    movw %ax, (%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; SSE2-NEXT:    movd %xmm1, %ecx
+; SSE2-NEXT:    movw %cx, 9(%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movd %xmm1, %edx
+; SSE2-NEXT:    movw %dx, 6(%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSE2-NEXT:    movd %xmm1, %esi
+; SSE2-NEXT:    movw %si, 3(%rdi)
+; SSE2-NEXT:    shrl $16, %eax
+; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    shrl $16, %ecx
+; SSE2-NEXT:    movb %cl, 11(%rdi)
+; SSE2-NEXT:    shrl $16, %edx
+; SSE2-NEXT:    movb %dl, 8(%rdi)
+; SSE2-NEXT:    shrl $16, %esi
+; SSE2-NEXT:    movb %sil, 5(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: uaddo_v4i24:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm2
+; SSSE3-NEXT:    paddd %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm0
+; SSSE3-NEXT:    movd %xmm2, %eax
+; SSSE3-NEXT:    movw %ax, (%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm1, %ecx
+; SSSE3-NEXT:    movw %cx, 9(%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm1, %edx
+; SSSE3-NEXT:    movw %dx, 6(%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm1, %esi
+; SSSE3-NEXT:    movw %si, 3(%rdi)
+; SSSE3-NEXT:    shrl $16, %eax
+; SSSE3-NEXT:    movb %al, 2(%rdi)
+; SSSE3-NEXT:    shrl $16, %ecx
+; SSSE3-NEXT:    movb %cl, 11(%rdi)
+; SSSE3-NEXT:    shrl $16, %edx
+; SSSE3-NEXT:    movb %dl, 8(%rdi)
+; SSSE3-NEXT:    shrl $16, %esi
+; SSSE3-NEXT:    movb %sil, 5(%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: uaddo_v4i24:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    paddd %xmm1, %xmm0
+; SSE41-NEXT:    pand %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm1
+; SSE41-NEXT:    pextrd $3, %xmm0, %eax
+; SSE41-NEXT:    movw %ax, 9(%rdi)
+; SSE41-NEXT:    pextrd $2, %xmm0, %ecx
+; SSE41-NEXT:    movw %cx, 6(%rdi)
+; SSE41-NEXT:    pextrd $1, %xmm0, %edx
+; SSE41-NEXT:    movw %dx, 3(%rdi)
+; SSE41-NEXT:    movd %xmm0, %esi
+; SSE41-NEXT:    movw %si, (%rdi)
+; SSE41-NEXT:    shrl $16, %eax
+; SSE41-NEXT:    movb %al, 11(%rdi)
+; SSE41-NEXT:    shrl $16, %ecx
+; SSE41-NEXT:    movb %cl, 8(%rdi)
+; SSE41-NEXT:    shrl $16, %edx
+; SSE41-NEXT:    movb %dl, 5(%rdi)
+; SSE41-NEXT:    shrl $16, %esi
+; SSE41-NEXT:    movb %sil, 2(%rdi)
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uaddo_v4i24:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2.35098856E-38,2.35098856E-38,2.35098856E-38,2.35098856E-38]
+; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrd $3, %xmm1, %eax
+; AVX1-NEXT:    movw %ax, 9(%rdi)
+; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX1-NEXT:    movw %cx, 6(%rdi)
+; AVX1-NEXT:    vpextrd $1, %xmm1, %edx
+; AVX1-NEXT:    movw %dx, 3(%rdi)
+; AVX1-NEXT:    vmovd %xmm1, %esi
+; AVX1-NEXT:    movw %si, (%rdi)
+; AVX1-NEXT:    shrl $16, %eax
+; AVX1-NEXT:    movb %al, 11(%rdi)
+; AVX1-NEXT:    shrl $16, %ecx
+; AVX1-NEXT:    movb %cl, 8(%rdi)
+; AVX1-NEXT:    shrl $16, %edx
+; AVX1-NEXT:    movb %dl, 5(%rdi)
+; AVX1-NEXT:    shrl $16, %esi
+; AVX1-NEXT:    movb %sil, 2(%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uaddo_v4i24:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrd $3, %xmm1, %eax
+; AVX2-NEXT:    movw %ax, 9(%rdi)
+; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX2-NEXT:    movw %cx, 6(%rdi)
+; AVX2-NEXT:    vpextrd $1, %xmm1, %edx
+; AVX2-NEXT:    movw %dx, 3(%rdi)
+; AVX2-NEXT:    vmovd %xmm1, %esi
+; AVX2-NEXT:    movw %si, (%rdi)
+; AVX2-NEXT:    shrl $16, %eax
+; AVX2-NEXT:    movb %al, 11(%rdi)
+; AVX2-NEXT:    shrl $16, %ecx
+; AVX2-NEXT:    movb %cl, 8(%rdi)
+; AVX2-NEXT:    shrl $16, %edx
+; AVX2-NEXT:    movb %dl, 5(%rdi)
+; AVX2-NEXT:    shrl $16, %esi
+; AVX2-NEXT:    movb %sil, 2(%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: uaddo_v4i24:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
+; AVX512-NEXT:    movw %ax, 9(%rdi)
+; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX512-NEXT:    movw %cx, 6(%rdi)
+; AVX512-NEXT:    vpextrd $1, %xmm1, %edx
+; AVX512-NEXT:    movw %dx, 3(%rdi)
+; AVX512-NEXT:    vmovd %xmm1, %esi
+; AVX512-NEXT:    movw %si, (%rdi)
+; AVX512-NEXT:    shrl $16, %eax
+; AVX512-NEXT:    movb %al, 11(%rdi)
+; AVX512-NEXT:    shrl $16, %ecx
+; AVX512-NEXT:    movb %cl, 8(%rdi)
+; AVX512-NEXT:    shrl $16, %edx
+; AVX512-NEXT:    movb %dl, 5(%rdi)
+; AVX512-NEXT:    shrl $16, %esi
+; AVX512-NEXT:    movb %sil, 2(%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
+  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i24> %val, <4 x i24>* %p2
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
+; SSE-LABEL: uaddo_v4i1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    pand %xmm0, %xmm2
+; SSE-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE-NEXT:    pxor %xmm2, %xmm1
+; SSE-NEXT:    pslld $31, %xmm0
+; SSE-NEXT:    movmskps %xmm0, %eax
+; SSE-NEXT:    movb %al, (%rdi)
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: uaddo_v4i1:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vmovmskps %xmm1, %eax
+; AVX1-NEXT:    movb %al, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uaddo_v4i1:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT:    vmovmskps %xmm1, %eax
+; AVX2-NEXT:    movb %al, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: uaddo_v4i1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
+; AVX512-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k2
+; AVX512-NEXT:    kxnorw %k1, %k0, %k1
+; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1 {%k1}
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    kmovd %k2, %eax
+; AVX512-NEXT:    movb %al, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
+  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i1> %val, <4 x i1>* %p2
+  ret <4 x i32> %res
+}
+
+define <2 x i32> @uaddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
+; SSE2-LABEL: uaddo_v2i128:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT:    setb %al
+; SSE2-NEXT:    movzbl %al, %r11d
+; SSE2-NEXT:    addq %r8, %rdi
+; SSE2-NEXT:    adcq %r9, %rsi
+; SSE2-NEXT:    setb %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pinsrw $4, %r11d, %xmm0
+; SSE2-NEXT:    movq %rdx, 16(%r10)
+; SSE2-NEXT:    movq %rdi, (%r10)
+; SSE2-NEXT:    movq %rcx, 24(%r10)
+; SSE2-NEXT:    movq %rsi, 8(%r10)
+; SSE2-NEXT:    psllq $63, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: uaddo_v2i128:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSSE3-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
+; SSSE3-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT:    setb %al
+; SSSE3-NEXT:    movzbl %al, %r11d
+; SSSE3-NEXT:    addq %r8, %rdi
+; SSSE3-NEXT:    adcq %r9, %rsi
+; SSSE3-NEXT:    setb %al
+; SSSE3-NEXT:    movzbl %al, %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    pinsrw $4, %r11d, %xmm0
+; SSSE3-NEXT:    movq %rdx, 16(%r10)
+; SSSE3-NEXT:    movq %rdi, (%r10)
+; SSSE3-NEXT:    movq %rcx, 24(%r10)
+; SSSE3-NEXT:    movq %rsi, 8(%r10)
+; SSSE3-NEXT:    psllq $63, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: uaddo_v2i128:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE41-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
+; SSE41-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT:    setb %al
+; SSE41-NEXT:    movzbl %al, %r11d
+; SSE41-NEXT:    addq %r8, %rdi
+; SSE41-NEXT:    adcq %r9, %rsi
+; SSE41-NEXT:    setb %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    pinsrb $8, %r11d, %xmm0
+; SSE41-NEXT:    movq %rdx, 16(%r10)
+; SSE41-NEXT:    movq %rdi, (%r10)
+; SSE41-NEXT:    movq %rcx, 24(%r10)
+; SSE41-NEXT:    movq %rsi, 8(%r10)
+; SSE41-NEXT:    psllq $63, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: uaddo_v2i128:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX1-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
+; AVX1-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; AVX1-NEXT:    setb %al
+; AVX1-NEXT:    movzbl %al, %r11d
+; AVX1-NEXT:    addq %r8, %rdi
+; AVX1-NEXT:    adcq %r9, %rsi
+; AVX1-NEXT:    setb %al
+; AVX1-NEXT:    movzbl %al, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rdx, 16(%r10)
+; AVX1-NEXT:    movq %rdi, (%r10)
+; AVX1-NEXT:    movq %rcx, 24(%r10)
+; AVX1-NEXT:    movq %rsi, 8(%r10)
+; AVX1-NEXT:    vpsllq $63, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uaddo_v2i128:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT:    setb %al
+; AVX2-NEXT:    movzbl %al, %r11d
+; AVX2-NEXT:    addq %r8, %rdi
+; AVX2-NEXT:    adcq %r9, %rsi
+; AVX2-NEXT:    setb %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rdx, 16(%r10)
+; AVX2-NEXT:    movq %rdi, (%r10)
+; AVX2-NEXT:    movq %rcx, 24(%r10)
+; AVX2-NEXT:    movq %rsi, 8(%r10)
+; AVX2-NEXT:    vpsllq $63, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: uaddo_v2i128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    setb %al
+; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    addq %r8, %rdi
+; AVX512-NEXT:    adcq %r9, %rsi
+; AVX512-NEXT:    setb %al
+; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1
+; AVX512-NEXT:    movq %rdx, 16(%r10)
+; AVX512-NEXT:    movq %rdi, (%r10)
+; AVX512-NEXT:    movq %rcx, 24(%r10)
+; AVX512-NEXT:    movq %rsi, 8(%r10)
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    retq
+  %t = call {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
+  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i128> %val, <2 x i128>* %p2
+  ret <2 x i32> %res
+}
diff --git a/test/CodeGen/X86/vec_usubo.ll b/test/CodeGen/X86/vec_usubo.ll
new file mode 100644
index 0000000..10de326
--- /dev/null
+++ b/test/CodeGen/X86/vec_usubo.ll
@@ -0,0 +1,1422 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
+
+declare {<1 x i32>, <1 x i1>} @llvm.usub.with.overflow.v1i32(<1 x i32>, <1 x i32>)
+declare {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32>, <2 x i32>)
+declare {<3 x i32>, <3 x i1>} @llvm.usub.with.overflow.v3i32(<3 x i32>, <3 x i32>)
+declare {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<6 x i32>, <6 x i1>} @llvm.usub.with.overflow.v6i32(<6 x i32>, <6 x i32>)
+declare {<8 x i32>, <8 x i1>} @llvm.usub.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<8 x i16>, <8 x i1>} @llvm.usub.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+
+declare {<4 x i24>, <4 x i1>} @llvm.usub.with.overflow.v4i24(<4 x i24>, <4 x i24>)
+declare {<4 x i1>, <4 x i1>} @llvm.usub.with.overflow.v4i1(<4 x i1>, <4 x i1>)
+declare {<2 x i128>, <2 x i1>} @llvm.usub.with.overflow.v2i128(<2 x i128>, <2 x i128>)
+
+define <1 x i32> @usubo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
+; SSE-LABEL: usubo_v1i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    subl %esi, %edi
+; SSE-NEXT:    sbbl %eax, %eax
+; SSE-NEXT:    movl %edi, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: usubo_v1i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    subl %esi, %edi
+; AVX-NEXT:    sbbl %eax, %eax
+; AVX-NEXT:    movl %edi, (%rdx)
+; AVX-NEXT:    retq
+  %t = call {<1 x i32>, <1 x i1>} @llvm.usub.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
+  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
+  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
+  %res = sext <1 x i1> %obit to <1 x i32>
+  store <1 x i32> %val, <1 x i32>* %p2
+  ret <1 x i32> %res
+}
+
+define <2 x i32> @usubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
+; SSE2-LABEL: usubo_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    psubq %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    movq %xmm0, (%rdi)
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: usubo_v2i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSSE3-NEXT:    pand %xmm2, %xmm1
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    psubq %xmm1, %xmm0
+; SSSE3-NEXT:    pand %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    pxor %xmm3, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT:    movq %xmm0, (%rdi)
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: usubo_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    psubq %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    pcmpeqq %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE41-NEXT:    movq %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: usubo_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT:    vmovq %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: usubo_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT:    vmovq %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: usubo_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512-NEXT:    vpmovqd %xmm0, (%rdi)
+; AVX512-NEXT:    vpcmpeqq %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %t = call {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
+  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i32> %val, <2 x i32>* %p2
+  ret <2 x i32> %res
+}
+
+define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
+; SSE2-LABEL: usubo_v3i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    psubd %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT:    movq %xmm0, (%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm0, 8(%rdi)
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: usubo_v3i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    psubd %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT:    movq %xmm0, (%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm0, 8(%rdi)
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: usubo_v3i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psubd %xmm1, %xmm2
+; SSE41-NEXT:    pminud %xmm2, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pextrd $2, %xmm2, 8(%rdi)
+; SSE41-NEXT:    movq %xmm2, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: usubo_v3i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpminud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
+; AVX1-NEXT:    vmovq %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: usubo_v3i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpminud %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
+; AVX2-NEXT:    vmovq %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: usubo_v3i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpnleud %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
+; AVX512-NEXT:    vmovq %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<3 x i32>, <3 x i1>} @llvm.usub.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
+  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
+  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
+  %res = sext <3 x i1> %obit to <3 x i32>
+  store <3 x i32> %val, <3 x i32>* %p2
+  ret <3 x i32> %res
+}
+
+define <4 x i32> @usubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
+; SSE2-LABEL: usubo_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    psubd %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: usubo_v4i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    psubd %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: usubo_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psubd %xmm1, %xmm2
+; SSE41-NEXT:    pminud %xmm2, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm2, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: usubo_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpminud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: usubo_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpminud %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: usubo_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpnleud %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
+  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i32> %val, <4 x i32>* %p2
+  ret <4 x i32> %res
+}
+
+define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
+; SSE2-LABEL: usubo_v6i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT:    movd %r8d, %xmm0
+; SSE2-NEXT:    movd %ecx, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movd %edx, %xmm3
+; SSE2-NEXT:    movd %esi, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT:    movd %r9d, %xmm1
+; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psubd %xmm2, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm4, (%rcx)
+; SSE2-NEXT:    pxor %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psubd %xmm3, %xmm0
+; SSE2-NEXT:    movq %xmm0, 16(%rcx)
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT:    movq %xmm0, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm4, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: usubo_v6i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movq %rdi, %rax
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSSE3-NEXT:    movd %r8d, %xmm0
+; SSSE3-NEXT:    movd %ecx, %xmm1
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    movd %edx, %xmm3
+; SSSE3-NEXT:    movd %esi, %xmm0
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSSE3-NEXT:    movd %r9d, %xmm1
+; SSSE3-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    psubd %xmm2, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm4, (%rcx)
+; SSSE3-NEXT:    pxor %xmm2, %xmm4
+; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    psubd %xmm3, %xmm0
+; SSSE3-NEXT:    movq %xmm0, 16(%rcx)
+; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSSE3-NEXT:    movq %xmm0, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm4, (%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: usubo_v6i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movq %rdi, %rax
+; SSE41-NEXT:    movd %esi, %xmm0
+; SSE41-NEXT:    pinsrd $1, %edx, %xmm0
+; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
+; SSE41-NEXT:    pinsrd $3, %r8d, %xmm0
+; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT:    movd %r9d, %xmm2
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
+; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    psubd %xmm3, %xmm4
+; SSE41-NEXT:    pminud %xmm4, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE41-NEXT:    pxor %xmm3, %xmm0
+; SSE41-NEXT:    movdqa %xmm2, %xmm5
+; SSE41-NEXT:    psubd %xmm1, %xmm5
+; SSE41-NEXT:    pminud %xmm5, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE41-NEXT:    pxor %xmm3, %xmm2
+; SSE41-NEXT:    movq %xmm5, 16(%rcx)
+; SSE41-NEXT:    movdqa %xmm4, (%rcx)
+; SSE41-NEXT:    movq %xmm2, 16(%rdi)
+; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: usubo_v6i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpminud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vmovq %xmm2, 16(%rdi)
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: usubo_v6i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpminud %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vmovq %xmm2, 16(%rdi)
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: usubo_v6i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpcmpnleud %ymm0, %ymm1, %k1
+; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT:    vmovq %xmm2, 16(%rdi)
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<6 x i32>, <6 x i1>} @llvm.usub.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
+  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
+  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
+  %res = sext <6 x i1> %obit to <6 x i32>
+  store <6 x i32> %val, <6 x i32>* %p2
+  ret <6 x i32> %res
+}
+
+define <8 x i32> @usubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
+; SSE2-LABEL: usubo_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    psubd %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    psubd %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: usubo_v8i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    psubd %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    pxor %xmm4, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    psubd %xmm3, %xmm1
+; SSSE3-NEXT:    pxor %xmm1, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm4, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: usubo_v8i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    psubd %xmm2, %xmm4
+; SSE41-NEXT:    pminud %xmm4, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm1, %xmm5
+; SSE41-NEXT:    psubd %xmm3, %xmm5
+; SSE41-NEXT:    pminud %xmm5, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm5, 16(%rdi)
+; SSE41-NEXT:    movdqa %xmm4, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: usubo_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpminud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vmovaps %ymm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: usubo_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpminud %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa %ymm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: usubo_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpcmpnleud %ymm0, %ymm1, %k1
+; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<8 x i32>, <8 x i1>} @llvm.usub.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
+  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
+  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
+  %res = sext <8 x i1> %obit to <8 x i32>
+  store <8 x i32> %val, <8 x i32>* %p2
+  ret <8 x i32> %res
+}
+
+define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind {
+; SSE2-LABEL: usubo_v16i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm9
+; SSE2-NEXT:    pxor %xmm8, %xmm9
+; SSE2-NEXT:    psubd %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    psubd %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    psubd %xmm6, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, 32(%rdi)
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    psubd %xmm7, %xmm3
+; SSE2-NEXT:    pxor %xmm3, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT:    movdqa %xmm3, 48(%rdi)
+; SSE2-NEXT:    movdqa %xmm8, %xmm3
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: usubo_v16i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm9
+; SSSE3-NEXT:    pxor %xmm8, %xmm9
+; SSSE3-NEXT:    psubd %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    pxor %xmm8, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pxor %xmm8, %xmm4
+; SSSE3-NEXT:    psubd %xmm5, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSSE3-NEXT:    pxor %xmm8, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pxor %xmm8, %xmm4
+; SSSE3-NEXT:    psubd %xmm6, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, 32(%rdi)
+; SSSE3-NEXT:    pxor %xmm8, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm2
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pxor %xmm8, %xmm4
+; SSSE3-NEXT:    psubd %xmm7, %xmm3
+; SSSE3-NEXT:    pxor %xmm3, %xmm8
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm8
+; SSSE3-NEXT:    movdqa %xmm3, 48(%rdi)
+; SSSE3-NEXT:    movdqa %xmm8, %xmm3
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: usubo_v16i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm8
+; SSE41-NEXT:    psubd %xmm4, %xmm8
+; SSE41-NEXT:    pminud %xmm8, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm9
+; SSE41-NEXT:    pxor %xmm9, %xmm0
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    psubd %xmm5, %xmm4
+; SSE41-NEXT:    pminud %xmm4, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm1
+; SSE41-NEXT:    pxor %xmm9, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm5
+; SSE41-NEXT:    psubd %xmm6, %xmm5
+; SSE41-NEXT:    pminud %xmm5, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE41-NEXT:    pxor %xmm9, %xmm2
+; SSE41-NEXT:    movdqa %xmm3, %xmm6
+; SSE41-NEXT:    psubd %xmm7, %xmm6
+; SSE41-NEXT:    pminud %xmm6, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm3
+; SSE41-NEXT:    pxor %xmm9, %xmm3
+; SSE41-NEXT:    movdqa %xmm6, 48(%rdi)
+; SSE41-NEXT:    movdqa %xmm5, 32(%rdi)
+; SSE41-NEXT:    movdqa %xmm4, 16(%rdi)
+; SSE41-NEXT:    movdqa %xmm8, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: usubo_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpminud %xmm5, %xmm4, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm4, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpminud %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpxor %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT:    vpsubd %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vpminud %xmm7, %xmm5, %xmm7
+; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm5, %xmm7
+; AVX1-NEXT:    vpxor %xmm6, %xmm7, %xmm7
+; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpminud %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT:    vmovaps %ymm3, 32(%rdi)
+; AVX1-NEXT:    vmovaps %ymm2, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: usubo_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm3
+; AVX2-NEXT:    vpminud %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX2-NEXT:    vpxor %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
+; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpminud %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: usubo_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vpcmpnleud %zmm0, %zmm1, %k1
+; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
+  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
+  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
+  %res = sext <16 x i1> %obit to <16 x i32>
+  store <16 x i32> %val, <16 x i32>* %p2
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind {
+; SSE2-LABEL: usubo_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psubb %xmm1, %xmm4
+; SSE2-NEXT:    pminub %xmm4, %xmm0
+; SSE2-NEXT:    pcmpeqb %xmm4, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    pslld $31, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    pslld $31, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    pslld $31, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    pslld $31, %xmm3
+; SSE2-NEXT:    psrad $31, %xmm3
+; SSE2-NEXT:    movdqa %xmm4, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: usubo_v16i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    psubb %xmm1, %xmm4
+; SSSE3-NEXT:    pminub %xmm4, %xmm0
+; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm0
+; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSSE3-NEXT:    pxor %xmm0, %xmm3
+; SSSE3-NEXT:    movdqa %xmm3, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    pslld $31, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    pslld $31, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    pslld $31, %xmm2
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSSE3-NEXT:    pslld $31, %xmm3
+; SSSE3-NEXT:    psrad $31, %xmm3
+; SSSE3-NEXT:    movdqa %xmm4, (%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: usubo_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    psubb %xmm1, %xmm4
+; SSE41-NEXT:    pminub %xmm4, %xmm0
+; SSE41-NEXT:    pcmpeqb %xmm4, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE41-NEXT:    pxor %xmm0, %xmm3
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm3
+; SSE41-NEXT:    psrad $31, %xmm3
+; SSE41-NEXT:    movdqa %xmm4, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: usubo_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpminub %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpcmpeqb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT:    vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: usubo_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpminub %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpcmpeqb %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrad $31, %ymm1, %ymm1
+; AVX2-NEXT:    vmovdqa %xmm2, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: usubo_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpnleub %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
+  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
+  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
+  %res = sext <16 x i1> %obit to <16 x i32>
+  store <16 x i8> %val, <16 x i8>* %p2
+  ret <16 x i32> %res
+}
+
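+; The <8 x i16> case follows the same pattern, with the <8 x i1> overflow mask widened to <8 x i32>.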
+define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind {
+; SSE2-LABEL: usubo_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    psubw %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    pslld $31, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    pslld $31, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: usubo_v8i16:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    psubw %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtw %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    pslld $31, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT:    pslld $31, %xmm2
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: usubo_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psubw %xmm1, %xmm2
+; SSE41-NEXT:    pminuw %xmm2, %xmm0
+; SSE41-NEXT:    pcmpeqw %xmm2, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT:    pslld $31, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT:    pslld $31, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: usubo_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpminuw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: usubo_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpminuw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: usubo_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpnleuw %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<8 x i16>, <8 x i1>} @llvm.usub.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
+  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
+  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
+  %res = sext <8 x i1> %obit to <8 x i32>
+  store <8 x i16> %val, <8 x i16>* %p2
+  ret <8 x i32> %res
+}
+
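+; The <2 x i64> case: pre-AVX512 targets have no unsigned 64-bit compare, so overflow is detected by biasing both values by the sign bit and using a signed compare.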
+define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
+; SSE-LABEL: usubo_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    pxor %xmm2, %xmm3
+; SSE-NEXT:    psubq %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm0, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm1
+; SSE-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSE-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT:    por %xmm2, %xmm1
+; SSE-NEXT:    movdqa %xmm0, (%rdi)
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: usubo_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: usubo_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: usubo_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpnleuq %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    retq
+  %t = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
+  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i64> %val, <2 x i64>* %p2
+  ret <2 x i32> %res
+}
+
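+; Non-byte-sized elements: the <4 x i24> operands are masked to 24 bits, subtracted as i32, and overflow is detected by re-masking the result and comparing; the 24-bit lanes are written back with scalar word/byte stores.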
+define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
+; SSE2-LABEL: usubo_v4i24:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    psubd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    movw %ax, (%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; SSE2-NEXT:    movd %xmm1, %ecx
+; SSE2-NEXT:    movw %cx, 9(%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movd %xmm1, %edx
+; SSE2-NEXT:    movw %dx, 6(%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSE2-NEXT:    movd %xmm1, %esi
+; SSE2-NEXT:    movw %si, 3(%rdi)
+; SSE2-NEXT:    shrl $16, %eax
+; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    shrl $16, %ecx
+; SSE2-NEXT:    movb %cl, 11(%rdi)
+; SSE2-NEXT:    shrl $16, %edx
+; SSE2-NEXT:    movb %dl, 8(%rdi)
+; SSE2-NEXT:    shrl $16, %esi
+; SSE2-NEXT:    movb %sil, 5(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: usubo_v4i24:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm2
+; SSSE3-NEXT:    psubd %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm0
+; SSSE3-NEXT:    movd %xmm2, %eax
+; SSSE3-NEXT:    movw %ax, (%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm1, %ecx
+; SSSE3-NEXT:    movw %cx, 9(%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm1, %edx
+; SSSE3-NEXT:    movw %dx, 6(%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm1, %esi
+; SSSE3-NEXT:    movw %si, 3(%rdi)
+; SSSE3-NEXT:    shrl $16, %eax
+; SSSE3-NEXT:    movb %al, 2(%rdi)
+; SSSE3-NEXT:    shrl $16, %ecx
+; SSSE3-NEXT:    movb %cl, 11(%rdi)
+; SSSE3-NEXT:    shrl $16, %edx
+; SSSE3-NEXT:    movb %dl, 8(%rdi)
+; SSSE3-NEXT:    shrl $16, %esi
+; SSSE3-NEXT:    movb %sil, 5(%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: usubo_v4i24:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    psubd %xmm1, %xmm0
+; SSE41-NEXT:    pand %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm1
+; SSE41-NEXT:    pextrd $3, %xmm0, %eax
+; SSE41-NEXT:    movw %ax, 9(%rdi)
+; SSE41-NEXT:    pextrd $2, %xmm0, %ecx
+; SSE41-NEXT:    movw %cx, 6(%rdi)
+; SSE41-NEXT:    pextrd $1, %xmm0, %edx
+; SSE41-NEXT:    movw %dx, 3(%rdi)
+; SSE41-NEXT:    movd %xmm0, %esi
+; SSE41-NEXT:    movw %si, (%rdi)
+; SSE41-NEXT:    shrl $16, %eax
+; SSE41-NEXT:    movb %al, 11(%rdi)
+; SSE41-NEXT:    shrl $16, %ecx
+; SSE41-NEXT:    movb %cl, 8(%rdi)
+; SSE41-NEXT:    shrl $16, %edx
+; SSE41-NEXT:    movb %dl, 5(%rdi)
+; SSE41-NEXT:    shrl $16, %esi
+; SSE41-NEXT:    movb %sil, 2(%rdi)
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: usubo_v4i24:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2.35098856E-38,2.35098856E-38,2.35098856E-38,2.35098856E-38]
+; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrd $3, %xmm1, %eax
+; AVX1-NEXT:    movw %ax, 9(%rdi)
+; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX1-NEXT:    movw %cx, 6(%rdi)
+; AVX1-NEXT:    vpextrd $1, %xmm1, %edx
+; AVX1-NEXT:    movw %dx, 3(%rdi)
+; AVX1-NEXT:    vmovd %xmm1, %esi
+; AVX1-NEXT:    movw %si, (%rdi)
+; AVX1-NEXT:    shrl $16, %eax
+; AVX1-NEXT:    movb %al, 11(%rdi)
+; AVX1-NEXT:    shrl $16, %ecx
+; AVX1-NEXT:    movb %cl, 8(%rdi)
+; AVX1-NEXT:    shrl $16, %edx
+; AVX1-NEXT:    movb %dl, 5(%rdi)
+; AVX1-NEXT:    shrl $16, %esi
+; AVX1-NEXT:    movb %sil, 2(%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: usubo_v4i24:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrd $3, %xmm1, %eax
+; AVX2-NEXT:    movw %ax, 9(%rdi)
+; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX2-NEXT:    movw %cx, 6(%rdi)
+; AVX2-NEXT:    vpextrd $1, %xmm1, %edx
+; AVX2-NEXT:    movw %dx, 3(%rdi)
+; AVX2-NEXT:    vmovd %xmm1, %esi
+; AVX2-NEXT:    movw %si, (%rdi)
+; AVX2-NEXT:    shrl $16, %eax
+; AVX2-NEXT:    movb %al, 11(%rdi)
+; AVX2-NEXT:    shrl $16, %ecx
+; AVX2-NEXT:    movb %cl, 8(%rdi)
+; AVX2-NEXT:    shrl $16, %edx
+; AVX2-NEXT:    movb %dl, 5(%rdi)
+; AVX2-NEXT:    shrl $16, %esi
+; AVX2-NEXT:    movb %sil, 2(%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: usubo_v4i24:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
+; AVX512-NEXT:    movw %ax, 9(%rdi)
+; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX512-NEXT:    movw %cx, 6(%rdi)
+; AVX512-NEXT:    vpextrd $1, %xmm1, %edx
+; AVX512-NEXT:    movw %dx, 3(%rdi)
+; AVX512-NEXT:    vmovd %xmm1, %esi
+; AVX512-NEXT:    movw %si, (%rdi)
+; AVX512-NEXT:    shrl $16, %eax
+; AVX512-NEXT:    movb %al, 11(%rdi)
+; AVX512-NEXT:    shrl $16, %ecx
+; AVX512-NEXT:    movb %cl, 8(%rdi)
+; AVX512-NEXT:    shrl $16, %edx
+; AVX512-NEXT:    movb %dl, 5(%rdi)
+; AVX512-NEXT:    shrl $16, %esi
+; AVX512-NEXT:    movb %sil, 2(%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<4 x i24>, <4 x i1>} @llvm.usub.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
+  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i24> %val, <4 x i24>* %p2
+  ret <4 x i32> %res
+}
+
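+; The <4 x i1> case masks the operands to their low bit before subtracting and stores the packed result as a 4-bit mask.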
+define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
+; SSE-LABEL: usubo_v4i1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    psubd %xmm1, %xmm0
+; SSE-NEXT:    pand %xmm0, %xmm2
+; SSE-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE-NEXT:    pxor %xmm2, %xmm1
+; SSE-NEXT:    pslld $31, %xmm0
+; SSE-NEXT:    movmskps %xmm0, %eax
+; SSE-NEXT:    movb %al, (%rdi)
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: usubo_v4i1:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vmovmskps %xmm1, %eax
+; AVX1-NEXT:    movb %al, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: usubo_v4i1:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT:    vmovmskps %xmm1, %eax
+; AVX2-NEXT:    movb %al, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: usubo_v4i1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
+; AVX512-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
+; AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k2 {%k1}
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k2} {z}
+; AVX512-NEXT:    kmovd %k1, %eax
+; AVX512-NEXT:    movb %al, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<4 x i1>, <4 x i1>} @llvm.usub.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
+  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i1> %val, <4 x i1>* %p2
+  ret <4 x i32> %res
+}
+
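+; The <2 x i128> case is expanded to scalar sub/sbb pairs, with the carry flags forming the <2 x i1> overflow result.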
+define <2 x i32> @usubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
+; SSE2-LABEL: usubo_v2i128:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT:    setb %al
+; SSE2-NEXT:    movzbl %al, %r11d
+; SSE2-NEXT:    subq %r8, %rdi
+; SSE2-NEXT:    sbbq %r9, %rsi
+; SSE2-NEXT:    setb %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pinsrw $4, %r11d, %xmm0
+; SSE2-NEXT:    movq %rdx, 16(%r10)
+; SSE2-NEXT:    movq %rdi, (%r10)
+; SSE2-NEXT:    movq %rcx, 24(%r10)
+; SSE2-NEXT:    movq %rsi, 8(%r10)
+; SSE2-NEXT:    psllq $63, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: usubo_v2i128:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSSE3-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
+; SSSE3-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT:    setb %al
+; SSSE3-NEXT:    movzbl %al, %r11d
+; SSSE3-NEXT:    subq %r8, %rdi
+; SSSE3-NEXT:    sbbq %r9, %rsi
+; SSSE3-NEXT:    setb %al
+; SSSE3-NEXT:    movzbl %al, %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    pinsrw $4, %r11d, %xmm0
+; SSSE3-NEXT:    movq %rdx, 16(%r10)
+; SSSE3-NEXT:    movq %rdi, (%r10)
+; SSSE3-NEXT:    movq %rcx, 24(%r10)
+; SSSE3-NEXT:    movq %rsi, 8(%r10)
+; SSSE3-NEXT:    psllq $63, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: usubo_v2i128:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE41-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
+; SSE41-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT:    setb %al
+; SSE41-NEXT:    movzbl %al, %r11d
+; SSE41-NEXT:    subq %r8, %rdi
+; SSE41-NEXT:    sbbq %r9, %rsi
+; SSE41-NEXT:    setb %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    pinsrb $8, %r11d, %xmm0
+; SSE41-NEXT:    movq %rdx, 16(%r10)
+; SSE41-NEXT:    movq %rdi, (%r10)
+; SSE41-NEXT:    movq %rcx, 24(%r10)
+; SSE41-NEXT:    movq %rsi, 8(%r10)
+; SSE41-NEXT:    psllq $63, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: usubo_v2i128:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX1-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
+; AVX1-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
+; AVX1-NEXT:    setb %al
+; AVX1-NEXT:    movzbl %al, %r11d
+; AVX1-NEXT:    subq %r8, %rdi
+; AVX1-NEXT:    sbbq %r9, %rsi
+; AVX1-NEXT:    setb %al
+; AVX1-NEXT:    movzbl %al, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rdx, 16(%r10)
+; AVX1-NEXT:    movq %rdi, (%r10)
+; AVX1-NEXT:    movq %rcx, 24(%r10)
+; AVX1-NEXT:    movq %rsi, 8(%r10)
+; AVX1-NEXT:    vpsllq $63, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: usubo_v2i128:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT:    setb %al
+; AVX2-NEXT:    movzbl %al, %r11d
+; AVX2-NEXT:    subq %r8, %rdi
+; AVX2-NEXT:    sbbq %r9, %rsi
+; AVX2-NEXT:    setb %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rdx, 16(%r10)
+; AVX2-NEXT:    movq %rdi, (%r10)
+; AVX2-NEXT:    movq %rcx, 24(%r10)
+; AVX2-NEXT:    movq %rsi, 8(%r10)
+; AVX2-NEXT:    vpsllq $63, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: usubo_v2i128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    setb %al
+; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    subq %r8, %rdi
+; AVX512-NEXT:    sbbq %r9, %rsi
+; AVX512-NEXT:    setb %al
+; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1
+; AVX512-NEXT:    movq %rdx, 16(%r10)
+; AVX512-NEXT:    movq %rdi, (%r10)
+; AVX512-NEXT:    movq %rcx, 24(%r10)
+; AVX512-NEXT:    movq %rsi, 8(%r10)
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    retq
+  %t = call {<2 x i128>, <2 x i1>} @llvm.usub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
+  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i128> %val, <2 x i128>* %p2
+  ret <2 x i32> %res
+}