[ARM] Lower sadd_sat to qadd8 and qadd16

Lower the target-independent signed saturating add intrinsics to qadd8 and
qadd16. The sadd_sat node is custom lowered, so that it is caught early, before
it is promoted. This also adds QADD8b and QADD16b nodes, which represent the
bottom "lane" of a qadd8/qadd16, so that we can run demanded-bits simplification
on them to show that they do not use the upper bits of their operands.

Also handles ssub_sat, which is lowered to qsub8 and qsub16 in the same way.

Differential Revision: https://reviews.llvm.org/D68974


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@375402 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index e9e3c66..8fac243 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -1021,6 +1021,12 @@
 
   setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
   setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
+  if (Subtarget->hasDSP()) {
+    setOperationAction(ISD::SADDSAT, MVT::i8, Custom);
+    setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
+    setOperationAction(ISD::SADDSAT, MVT::i16, Custom);
+    setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
+  }
 
   // i64 operation support.
   setOperationAction(ISD::MUL,     MVT::i64, Expand);
@@ -1622,6 +1628,10 @@
   case ARMISD::SMLSLDX:       return "ARMISD::SMLSLDX";
   case ARMISD::SMMLAR:        return "ARMISD::SMMLAR";
   case ARMISD::SMMLSR:        return "ARMISD::SMMLSR";
+  case ARMISD::QADD16b:       return "ARMISD::QADD16b";
+  case ARMISD::QSUB16b:       return "ARMISD::QSUB16b";
+  case ARMISD::QADD8b:        return "ARMISD::QADD8b";
+  case ARMISD::QSUB8b:        return "ARMISD::QSUB8b";
   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
   case ARMISD::BFI:           return "ARMISD::BFI";
   case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
@@ -4445,6 +4455,35 @@
   return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
 }
 
+static SDValue LowerSADDSUBSAT(SDValue Op, SelectionDAG &DAG,
+                               const ARMSubtarget *Subtarget) {
+  EVT VT = Op.getValueType(); // Result type of the saturating add/sub.
+  if (!Subtarget->hasDSP())
+    return SDValue(); // Empty result: the subtarget has no DSP support.
+  if (!VT.isSimple())
+    return SDValue(); // Empty result: VT has no simple MVT to switch on.
+  // Map SADDSAT/SSUBSAT on i8/i16 to the matching bottom-lane ARMISD node.
+  unsigned NewOpcode;
+  bool IsAdd = Op->getOpcode() == ISD::SADDSAT; // Otherwise treated as a sub.
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    return SDValue(); // Only i8 and i16 are handled here.
+  case MVT::i8:
+    NewOpcode = IsAdd ? ARMISD::QADD8b : ARMISD::QSUB8b;
+    break;
+  case MVT::i16:
+    NewOpcode = IsAdd ? ARMISD::QADD16b : ARMISD::QSUB16b;
+    break;
+  }
+  // Build the node on i32 operands, then truncate the i32 result back to VT.
+  SDLoc dl(Op);
+  SDValue Add = // Despite the name, this holds a QSUB node for SSUBSAT.
+      DAG.getNode(NewOpcode, dl, MVT::i32,
+                  DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
+                  DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
+  return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
+}
+
 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   SDValue Cond = Op.getOperand(0);
   SDValue SelectTrue = Op.getOperand(1);
@@ -9121,6 +9160,9 @@
   case ISD::UADDO:
   case ISD::USUBO:
     return LowerUnsignedALUO(Op, DAG);
+  case ISD::SADDSAT:
+  case ISD::SSUBSAT:
+    return LowerSADDSUBSAT(Op, DAG, Subtarget);
   case ISD::LOAD:
     return LowerPredicateLoad(Op, DAG);
   case ISD::STORE:
@@ -9205,6 +9247,10 @@
     Results.push_back(Res.getValue(0));
     Results.push_back(Res.getValue(1));
     return;
+  case ISD::SADDSAT:
+  case ISD::SSUBSAT:
+    Res = LowerSADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
+    break;
   case ISD::READCYCLECOUNTER:
     ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
     return;
@@ -14382,7 +14428,9 @@
       return SDValue();
     break;
   }
-  case ARMISD::SMLALBB: {
+  case ARMISD::SMLALBB:
+  case ARMISD::QADD16b:
+  case ARMISD::QSUB16b: {
     unsigned BitWidth = N->getValueType(0).getSizeInBits();
     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
     if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
@@ -14418,6 +14466,15 @@
       return SDValue();
     break;
   }
+  case ARMISD::QADD8b:
+  case ARMISD::QSUB8b: {
+    unsigned BitWidth = N->getValueType(0).getSizeInBits();
+    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
+    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
+        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
+      return SDValue();
+    break;
+  }
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 98e0684..82f1d2e 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -219,6 +219,12 @@
       SMMLAR,       // Signed multiply long, round and add
       SMMLSR,       // Signed multiply long, subtract and round
 
+      // Single-lane QADD8/QSUB8/QADD16/QSUB16: bottom lane only (the 'b' suffix).
+      QADD8b,
+      QSUB8b,
+      QADD16b,
+      QSUB16b,
+
       // Operands of the standard BUILD_VECTOR node are not legalized, which
       // is fine if BUILD_VECTORs are always lowered to shuffles or other
       // operations, but for ARM some BUILD_VECTORs are legal as-is and their
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index f753436..61e7f3d 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -238,6 +238,11 @@
 def ARMsmlaltb      : SDNode<"ARMISD::SMLALTB", SDT_LongMac, []>;
 def ARMsmlaltt      : SDNode<"ARMISD::SMLALTT", SDT_LongMac, []>;
 
+def ARMqadd8b       : SDNode<"ARMISD::QADD8b", SDT_ARMAnd, []>;
+def ARMqsub8b       : SDNode<"ARMISD::QSUB8b", SDT_ARMAnd, []>;
+def ARMqadd16b      : SDNode<"ARMISD::QADD16b", SDT_ARMAnd, []>;
+def ARMqsub16b      : SDNode<"ARMISD::QSUB16b", SDT_ARMAnd, []>;
+
 // Vector operations shared between NEON and MVE
 
 def ARMvdup      : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;
@@ -3750,6 +3755,15 @@
                 [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))]>;
 }
 
+def : ARMV6Pat<(ARMqadd8b rGPR:$Rm, rGPR:$Rn),
+               (QADD8 rGPR:$Rm, rGPR:$Rn)>;
+def : ARMV6Pat<(ARMqsub8b rGPR:$Rm, rGPR:$Rn),
+               (QSUB8 rGPR:$Rm, rGPR:$Rn)>;
+def : ARMV6Pat<(ARMqadd16b rGPR:$Rm, rGPR:$Rn),
+               (QADD16 rGPR:$Rm, rGPR:$Rn)>;
+def : ARMV6Pat<(ARMqsub16b rGPR:$Rm, rGPR:$Rn),
+               (QSUB16 rGPR:$Rm, rGPR:$Rn)>;
+
 def UQADD16 : AAIIntrinsic<0b01100110, 0b11110001, "uqadd16", int_arm_uqadd16>;
 def UQADD8  : AAIIntrinsic<0b01100110, 0b11111001, "uqadd8", int_arm_uqadd8>;
 def UQSUB16 : AAIIntrinsic<0b01100110, 0b11110111, "uqsub16", int_arm_uqsub16>;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index ef5d090..4fb98fb 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -2395,6 +2395,15 @@
 def : Thumb2DSPPat<(int_arm_qsub rGPR:$Rm, (int_arm_qadd rGPR:$Rn, rGPR:$Rn)),
                    (t2QDSUB rGPR:$Rm, rGPR:$Rn)>;
 
+def : Thumb2DSPPat<(ARMqadd8b rGPR:$Rm, rGPR:$Rn),
+                   (t2QADD8 rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ARMqsub8b rGPR:$Rm, rGPR:$Rn),
+                   (t2QSUB8 rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ARMqadd16b rGPR:$Rm, rGPR:$Rn),
+                   (t2QADD16 rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ARMqsub16b rGPR:$Rm, rGPR:$Rn),
+                   (t2QSUB16 rGPR:$Rm, rGPR:$Rn)>;
+
 // Signed/Unsigned add/subtract
 
 def t2SASX    : T2I_pam_intrinsics<0b010, 0b0000, "sasx", int_arm_sasx>;
diff --git a/test/CodeGen/ARM/sadd_sat.ll b/test/CodeGen/ARM/sadd_sat.ll
index 539f3af..1fcc460 100644
--- a/test/CodeGen/ARM/sadd_sat.ll
+++ b/test/CodeGen/ARM/sadd_sat.ll
@@ -233,35 +233,63 @@
 ; CHECK-T1-NEXT:  .LCPI2_1:
 ; CHECK-T1-NEXT:    .long 4294934528 @ 0xffff8000
 ;
-; CHECK-T2-LABEL: func16:
-; CHECK-T2:       @ %bb.0:
-; CHECK-T2-NEXT:    add r0, r1
-; CHECK-T2-NEXT:    movw r1, #32767
-; CHECK-T2-NEXT:    cmp r0, r1
-; CHECK-T2-NEXT:    it lt
-; CHECK-T2-NEXT:    movlt r1, r0
-; CHECK-T2-NEXT:    movw r0, #32768
-; CHECK-T2-NEXT:    cmn.w r1, #32768
-; CHECK-T2-NEXT:    movt r0, #65535
-; CHECK-T2-NEXT:    it gt
-; CHECK-T2-NEXT:    movgt r0, r1
-; CHECK-T2-NEXT:    bx lr
+; CHECK-T2NODSP-LABEL: func16:
+; CHECK-T2NODSP:       @ %bb.0:
+; CHECK-T2NODSP-NEXT:    add r0, r1
+; CHECK-T2NODSP-NEXT:    movw r1, #32767
+; CHECK-T2NODSP-NEXT:    cmp r0, r1
+; CHECK-T2NODSP-NEXT:    it lt
+; CHECK-T2NODSP-NEXT:    movlt r1, r0
+; CHECK-T2NODSP-NEXT:    movw r0, #32768
+; CHECK-T2NODSP-NEXT:    cmn.w r1, #32768
+; CHECK-T2NODSP-NEXT:    movt r0, #65535
+; CHECK-T2NODSP-NEXT:    it gt
+; CHECK-T2NODSP-NEXT:    movgt r0, r1
+; CHECK-T2NODSP-NEXT:    bx lr
 ;
-; CHECK-ARM-LABEL: func16:
-; CHECK-ARM:       @ %bb.0:
-; CHECK-ARM-NEXT:    add r0, r0, r1
-; CHECK-ARM-NEXT:    mov r1, #255
-; CHECK-ARM-NEXT:    orr r1, r1, #32512
-; CHECK-ARM-NEXT:    cmp r0, r1
-; CHECK-ARM-NEXT:    movlt r1, r0
-; CHECK-ARM-NEXT:    ldr r0, .LCPI2_0
-; CHECK-ARM-NEXT:    cmn r1, #32768
-; CHECK-ARM-NEXT:    movgt r0, r1
-; CHECK-ARM-NEXT:    bx lr
-; CHECK-ARM-NEXT:    .p2align 2
-; CHECK-ARM-NEXT:  @ %bb.1:
-; CHECK-ARM-NEXT:  .LCPI2_0:
-; CHECK-ARM-NEXT:    .long 4294934528 @ 0xffff8000
+; CHECK-T2DSP-LABEL: func16:
+; CHECK-T2DSP:       @ %bb.0:
+; CHECK-T2DSP-NEXT:    qadd16 r0, r0, r1
+; CHECK-T2DSP-NEXT:    sxth r0, r0
+; CHECK-T2DSP-NEXT:    bx lr
+;
+; CHECK-ARMNODPS-LABEL: func16:
+; CHECK-ARMNODPS:       @ %bb.0:
+; CHECK-ARMNODPS-NEXT:    add r0, r0, r1
+; CHECK-ARMNODPS-NEXT:    mov r1, #255
+; CHECK-ARMNODPS-NEXT:    orr r1, r1, #32512
+; CHECK-ARMNODPS-NEXT:    cmp r0, r1
+; CHECK-ARMNODPS-NEXT:    movlt r1, r0
+; CHECK-ARMNODPS-NEXT:    ldr r0, .LCPI2_0
+; CHECK-ARMNODPS-NEXT:    cmn r1, #32768
+; CHECK-ARMNODPS-NEXT:    movgt r0, r1
+; CHECK-ARMNODPS-NEXT:    bx lr
+; CHECK-ARMNODPS-NEXT:    .p2align 2
+; CHECK-ARMNODPS-NEXT:  @ %bb.1:
+; CHECK-ARMNODPS-NEXT:  .LCPI2_0:
+; CHECK-ARMNODPS-NEXT:    .long 4294934528 @ 0xffff8000
+;
+; CHECK-ARMBASEDSP-LABEL: func16:
+; CHECK-ARMBASEDSP:       @ %bb.0:
+; CHECK-ARMBASEDSP-NEXT:    add r0, r0, r1
+; CHECK-ARMBASEDSP-NEXT:    mov r1, #255
+; CHECK-ARMBASEDSP-NEXT:    orr r1, r1, #32512
+; CHECK-ARMBASEDSP-NEXT:    cmp r0, r1
+; CHECK-ARMBASEDSP-NEXT:    movlt r1, r0
+; CHECK-ARMBASEDSP-NEXT:    ldr r0, .LCPI2_0
+; CHECK-ARMBASEDSP-NEXT:    cmn r1, #32768
+; CHECK-ARMBASEDSP-NEXT:    movgt r0, r1
+; CHECK-ARMBASEDSP-NEXT:    bx lr
+; CHECK-ARMBASEDSP-NEXT:    .p2align 2
+; CHECK-ARMBASEDSP-NEXT:  @ %bb.1:
+; CHECK-ARMBASEDSP-NEXT:  .LCPI2_0:
+; CHECK-ARMBASEDSP-NEXT:    .long 4294934528 @ 0xffff8000
+;
+; CHECK-ARMDSP-LABEL: func16:
+; CHECK-ARMDSP:       @ %bb.0:
+; CHECK-ARMDSP-NEXT:    qadd16 r0, r0, r1
+; CHECK-ARMDSP-NEXT:    sxth r0, r0
+; CHECK-ARMDSP-NEXT:    bx lr
   %tmp = call i16 @llvm.sadd.sat.i16(i16 %x, i16 %y)
   ret i16 %tmp
 }
@@ -284,25 +312,46 @@
 ; CHECK-T1-NEXT:  .LBB3_4:
 ; CHECK-T1-NEXT:    bx lr
 ;
-; CHECK-T2-LABEL: func8:
-; CHECK-T2:       @ %bb.0:
-; CHECK-T2-NEXT:    add r0, r1
-; CHECK-T2-NEXT:    cmp r0, #127
-; CHECK-T2-NEXT:    it ge
-; CHECK-T2-NEXT:    movge r0, #127
-; CHECK-T2-NEXT:    cmn.w r0, #128
-; CHECK-T2-NEXT:    it le
-; CHECK-T2-NEXT:    mvnle r0, #127
-; CHECK-T2-NEXT:    bx lr
+; CHECK-T2NODSP-LABEL: func8:
+; CHECK-T2NODSP:       @ %bb.0:
+; CHECK-T2NODSP-NEXT:    add r0, r1
+; CHECK-T2NODSP-NEXT:    cmp r0, #127
+; CHECK-T2NODSP-NEXT:    it ge
+; CHECK-T2NODSP-NEXT:    movge r0, #127
+; CHECK-T2NODSP-NEXT:    cmn.w r0, #128
+; CHECK-T2NODSP-NEXT:    it le
+; CHECK-T2NODSP-NEXT:    mvnle r0, #127
+; CHECK-T2NODSP-NEXT:    bx lr
 ;
-; CHECK-ARM-LABEL: func8:
-; CHECK-ARM:       @ %bb.0:
-; CHECK-ARM-NEXT:    add r0, r0, r1
-; CHECK-ARM-NEXT:    cmp r0, #127
-; CHECK-ARM-NEXT:    movge r0, #127
-; CHECK-ARM-NEXT:    cmn r0, #128
-; CHECK-ARM-NEXT:    mvnle r0, #127
-; CHECK-ARM-NEXT:    bx lr
+; CHECK-T2DSP-LABEL: func8:
+; CHECK-T2DSP:       @ %bb.0:
+; CHECK-T2DSP-NEXT:    qadd8 r0, r0, r1
+; CHECK-T2DSP-NEXT:    sxtb r0, r0
+; CHECK-T2DSP-NEXT:    bx lr
+;
+; CHECK-ARMNODPS-LABEL: func8:
+; CHECK-ARMNODPS:       @ %bb.0:
+; CHECK-ARMNODPS-NEXT:    add r0, r0, r1
+; CHECK-ARMNODPS-NEXT:    cmp r0, #127
+; CHECK-ARMNODPS-NEXT:    movge r0, #127
+; CHECK-ARMNODPS-NEXT:    cmn r0, #128
+; CHECK-ARMNODPS-NEXT:    mvnle r0, #127
+; CHECK-ARMNODPS-NEXT:    bx lr
+;
+; CHECK-ARMBASEDSP-LABEL: func8:
+; CHECK-ARMBASEDSP:       @ %bb.0:
+; CHECK-ARMBASEDSP-NEXT:    add r0, r0, r1
+; CHECK-ARMBASEDSP-NEXT:    cmp r0, #127
+; CHECK-ARMBASEDSP-NEXT:    movge r0, #127
+; CHECK-ARMBASEDSP-NEXT:    cmn r0, #128
+; CHECK-ARMBASEDSP-NEXT:    mvnle r0, #127
+; CHECK-ARMBASEDSP-NEXT:    bx lr
+;
+; CHECK-ARMDSP-LABEL: func8:
+; CHECK-ARMDSP:       @ %bb.0:
+; CHECK-ARMDSP-NEXT:    qadd8 r0, r0, r1
+; CHECK-ARMDSP-NEXT:    sxtb r0, r0
+; CHECK-ARMDSP-NEXT:    bx lr
   %tmp = call i8 @llvm.sadd.sat.i8(i8 %x, i8 %y)
   ret i8 %tmp
 }
diff --git a/test/CodeGen/ARM/sadd_sat_plus.ll b/test/CodeGen/ARM/sadd_sat_plus.ll
index 94aca12..370d5d9 100644
--- a/test/CodeGen/ARM/sadd_sat_plus.ll
+++ b/test/CodeGen/ARM/sadd_sat_plus.ll
@@ -258,29 +258,15 @@
 ; CHECK-T2DSP-LABEL: func16:
 ; CHECK-T2DSP:       @ %bb.0:
 ; CHECK-T2DSP-NEXT:    muls r1, r2, r1
-; CHECK-T2DSP-NEXT:    sxtah r0, r0, r1
-; CHECK-T2DSP-NEXT:    movw r1, #32767
-; CHECK-T2DSP-NEXT:    cmp r0, r1
-; CHECK-T2DSP-NEXT:    it lt
-; CHECK-T2DSP-NEXT:    movlt r1, r0
-; CHECK-T2DSP-NEXT:    movw r0, #32768
-; CHECK-T2DSP-NEXT:    cmn.w r1, #32768
-; CHECK-T2DSP-NEXT:    movt r0, #65535
-; CHECK-T2DSP-NEXT:    it gt
-; CHECK-T2DSP-NEXT:    movgt r0, r1
+; CHECK-T2DSP-NEXT:    qadd16 r0, r0, r1
+; CHECK-T2DSP-NEXT:    sxth r0, r0
 ; CHECK-T2DSP-NEXT:    bx lr
 ;
 ; CHECK-ARM-LABEL: func16:
 ; CHECK-ARM:       @ %bb.0:
 ; CHECK-ARM-NEXT:    smulbb r1, r1, r2
-; CHECK-ARM-NEXT:    sxtah r0, r0, r1
-; CHECK-ARM-NEXT:    movw r1, #32767
-; CHECK-ARM-NEXT:    cmp r0, r1
-; CHECK-ARM-NEXT:    movlt r1, r0
-; CHECK-ARM-NEXT:    movw r0, #32768
-; CHECK-ARM-NEXT:    movt r0, #65535
-; CHECK-ARM-NEXT:    cmn r1, #32768
-; CHECK-ARM-NEXT:    movgt r0, r1
+; CHECK-ARM-NEXT:    qadd16 r0, r0, r1
+; CHECK-ARM-NEXT:    sxth r0, r0
 ; CHECK-ARM-NEXT:    bx lr
   %a = mul i16 %y, %z
   %tmp = call i16 @llvm.sadd.sat.i16(i16 %x, i16 %a)
@@ -323,23 +309,15 @@
 ; CHECK-T2DSP-LABEL: func8:
 ; CHECK-T2DSP:       @ %bb.0:
 ; CHECK-T2DSP-NEXT:    muls r1, r2, r1
-; CHECK-T2DSP-NEXT:    sxtab r0, r0, r1
-; CHECK-T2DSP-NEXT:    cmp r0, #127
-; CHECK-T2DSP-NEXT:    it ge
-; CHECK-T2DSP-NEXT:    movge r0, #127
-; CHECK-T2DSP-NEXT:    cmn.w r0, #128
-; CHECK-T2DSP-NEXT:    it le
-; CHECK-T2DSP-NEXT:    mvnle r0, #127
+; CHECK-T2DSP-NEXT:    qadd8 r0, r0, r1
+; CHECK-T2DSP-NEXT:    sxtb r0, r0
 ; CHECK-T2DSP-NEXT:    bx lr
 ;
 ; CHECK-ARM-LABEL: func8:
 ; CHECK-ARM:       @ %bb.0:
 ; CHECK-ARM-NEXT:    smulbb r1, r1, r2
-; CHECK-ARM-NEXT:    sxtab r0, r0, r1
-; CHECK-ARM-NEXT:    cmp r0, #127
-; CHECK-ARM-NEXT:    movge r0, #127
-; CHECK-ARM-NEXT:    cmn r0, #128
-; CHECK-ARM-NEXT:    mvnle r0, #127
+; CHECK-ARM-NEXT:    qadd8 r0, r0, r1
+; CHECK-ARM-NEXT:    sxtb r0, r0
 ; CHECK-ARM-NEXT:    bx lr
   %a = mul i8 %y, %z
   %tmp = call i8 @llvm.sadd.sat.i8(i8 %x, i8 %a)
diff --git a/test/CodeGen/ARM/ssub_sat.ll b/test/CodeGen/ARM/ssub_sat.ll
index 6efb34d..0c9e125 100644
--- a/test/CodeGen/ARM/ssub_sat.ll
+++ b/test/CodeGen/ARM/ssub_sat.ll
@@ -235,35 +235,63 @@
 ; CHECK-T1-NEXT:  .LCPI2_1:
 ; CHECK-T1-NEXT:    .long 4294934528 @ 0xffff8000
 ;
-; CHECK-T2-LABEL: func16:
-; CHECK-T2:       @ %bb.0:
-; CHECK-T2-NEXT:    subs r0, r0, r1
-; CHECK-T2-NEXT:    movw r1, #32767
-; CHECK-T2-NEXT:    cmp r0, r1
-; CHECK-T2-NEXT:    it lt
-; CHECK-T2-NEXT:    movlt r1, r0
-; CHECK-T2-NEXT:    movw r0, #32768
-; CHECK-T2-NEXT:    cmn.w r1, #32768
-; CHECK-T2-NEXT:    movt r0, #65535
-; CHECK-T2-NEXT:    it gt
-; CHECK-T2-NEXT:    movgt r0, r1
-; CHECK-T2-NEXT:    bx lr
+; CHECK-T2NODSP-LABEL: func16:
+; CHECK-T2NODSP:       @ %bb.0:
+; CHECK-T2NODSP-NEXT:    subs r0, r0, r1
+; CHECK-T2NODSP-NEXT:    movw r1, #32767
+; CHECK-T2NODSP-NEXT:    cmp r0, r1
+; CHECK-T2NODSP-NEXT:    it lt
+; CHECK-T2NODSP-NEXT:    movlt r1, r0
+; CHECK-T2NODSP-NEXT:    movw r0, #32768
+; CHECK-T2NODSP-NEXT:    cmn.w r1, #32768
+; CHECK-T2NODSP-NEXT:    movt r0, #65535
+; CHECK-T2NODSP-NEXT:    it gt
+; CHECK-T2NODSP-NEXT:    movgt r0, r1
+; CHECK-T2NODSP-NEXT:    bx lr
 ;
-; CHECK-ARM-LABEL: func16:
-; CHECK-ARM:       @ %bb.0:
-; CHECK-ARM-NEXT:    sub r0, r0, r1
-; CHECK-ARM-NEXT:    mov r1, #255
-; CHECK-ARM-NEXT:    orr r1, r1, #32512
-; CHECK-ARM-NEXT:    cmp r0, r1
-; CHECK-ARM-NEXT:    movlt r1, r0
-; CHECK-ARM-NEXT:    ldr r0, .LCPI2_0
-; CHECK-ARM-NEXT:    cmn r1, #32768
-; CHECK-ARM-NEXT:    movgt r0, r1
-; CHECK-ARM-NEXT:    bx lr
-; CHECK-ARM-NEXT:    .p2align 2
-; CHECK-ARM-NEXT:  @ %bb.1:
-; CHECK-ARM-NEXT:  .LCPI2_0:
-; CHECK-ARM-NEXT:    .long 4294934528 @ 0xffff8000
+; CHECK-T2DSP-LABEL: func16:
+; CHECK-T2DSP:       @ %bb.0:
+; CHECK-T2DSP-NEXT:    qsub16 r0, r0, r1
+; CHECK-T2DSP-NEXT:    sxth r0, r0
+; CHECK-T2DSP-NEXT:    bx lr
+;
+; CHECK-ARMNODPS-LABEL: func16:
+; CHECK-ARMNODPS:       @ %bb.0:
+; CHECK-ARMNODPS-NEXT:    sub r0, r0, r1
+; CHECK-ARMNODPS-NEXT:    mov r1, #255
+; CHECK-ARMNODPS-NEXT:    orr r1, r1, #32512
+; CHECK-ARMNODPS-NEXT:    cmp r0, r1
+; CHECK-ARMNODPS-NEXT:    movlt r1, r0
+; CHECK-ARMNODPS-NEXT:    ldr r0, .LCPI2_0
+; CHECK-ARMNODPS-NEXT:    cmn r1, #32768
+; CHECK-ARMNODPS-NEXT:    movgt r0, r1
+; CHECK-ARMNODPS-NEXT:    bx lr
+; CHECK-ARMNODPS-NEXT:    .p2align 2
+; CHECK-ARMNODPS-NEXT:  @ %bb.1:
+; CHECK-ARMNODPS-NEXT:  .LCPI2_0:
+; CHECK-ARMNODPS-NEXT:    .long 4294934528 @ 0xffff8000
+;
+; CHECK-ARMBASEDSP-LABEL: func16:
+; CHECK-ARMBASEDSP:       @ %bb.0:
+; CHECK-ARMBASEDSP-NEXT:    sub r0, r0, r1
+; CHECK-ARMBASEDSP-NEXT:    mov r1, #255
+; CHECK-ARMBASEDSP-NEXT:    orr r1, r1, #32512
+; CHECK-ARMBASEDSP-NEXT:    cmp r0, r1
+; CHECK-ARMBASEDSP-NEXT:    movlt r1, r0
+; CHECK-ARMBASEDSP-NEXT:    ldr r0, .LCPI2_0
+; CHECK-ARMBASEDSP-NEXT:    cmn r1, #32768
+; CHECK-ARMBASEDSP-NEXT:    movgt r0, r1
+; CHECK-ARMBASEDSP-NEXT:    bx lr
+; CHECK-ARMBASEDSP-NEXT:    .p2align 2
+; CHECK-ARMBASEDSP-NEXT:  @ %bb.1:
+; CHECK-ARMBASEDSP-NEXT:  .LCPI2_0:
+; CHECK-ARMBASEDSP-NEXT:    .long 4294934528 @ 0xffff8000
+;
+; CHECK-ARMDSP-LABEL: func16:
+; CHECK-ARMDSP:       @ %bb.0:
+; CHECK-ARMDSP-NEXT:    qsub16 r0, r0, r1
+; CHECK-ARMDSP-NEXT:    sxth r0, r0
+; CHECK-ARMDSP-NEXT:    bx lr
   %tmp = call i16 @llvm.ssub.sat.i16(i16 %x, i16 %y)
   ret i16 %tmp
 }
@@ -286,25 +314,46 @@
 ; CHECK-T1-NEXT:  .LBB3_4:
 ; CHECK-T1-NEXT:    bx lr
 ;
-; CHECK-T2-LABEL: func8:
-; CHECK-T2:       @ %bb.0:
-; CHECK-T2-NEXT:    subs r0, r0, r1
-; CHECK-T2-NEXT:    cmp r0, #127
-; CHECK-T2-NEXT:    it ge
-; CHECK-T2-NEXT:    movge r0, #127
-; CHECK-T2-NEXT:    cmn.w r0, #128
-; CHECK-T2-NEXT:    it le
-; CHECK-T2-NEXT:    mvnle r0, #127
-; CHECK-T2-NEXT:    bx lr
+; CHECK-T2NODSP-LABEL: func8:
+; CHECK-T2NODSP:       @ %bb.0:
+; CHECK-T2NODSP-NEXT:    subs r0, r0, r1
+; CHECK-T2NODSP-NEXT:    cmp r0, #127
+; CHECK-T2NODSP-NEXT:    it ge
+; CHECK-T2NODSP-NEXT:    movge r0, #127
+; CHECK-T2NODSP-NEXT:    cmn.w r0, #128
+; CHECK-T2NODSP-NEXT:    it le
+; CHECK-T2NODSP-NEXT:    mvnle r0, #127
+; CHECK-T2NODSP-NEXT:    bx lr
 ;
-; CHECK-ARM-LABEL: func8:
-; CHECK-ARM:       @ %bb.0:
-; CHECK-ARM-NEXT:    sub r0, r0, r1
-; CHECK-ARM-NEXT:    cmp r0, #127
-; CHECK-ARM-NEXT:    movge r0, #127
-; CHECK-ARM-NEXT:    cmn r0, #128
-; CHECK-ARM-NEXT:    mvnle r0, #127
-; CHECK-ARM-NEXT:    bx lr
+; CHECK-T2DSP-LABEL: func8:
+; CHECK-T2DSP:       @ %bb.0:
+; CHECK-T2DSP-NEXT:    qsub8 r0, r0, r1
+; CHECK-T2DSP-NEXT:    sxtb r0, r0
+; CHECK-T2DSP-NEXT:    bx lr
+;
+; CHECK-ARMNODPS-LABEL: func8:
+; CHECK-ARMNODPS:       @ %bb.0:
+; CHECK-ARMNODPS-NEXT:    sub r0, r0, r1
+; CHECK-ARMNODPS-NEXT:    cmp r0, #127
+; CHECK-ARMNODPS-NEXT:    movge r0, #127
+; CHECK-ARMNODPS-NEXT:    cmn r0, #128
+; CHECK-ARMNODPS-NEXT:    mvnle r0, #127
+; CHECK-ARMNODPS-NEXT:    bx lr
+;
+; CHECK-ARMBASEDSP-LABEL: func8:
+; CHECK-ARMBASEDSP:       @ %bb.0:
+; CHECK-ARMBASEDSP-NEXT:    sub r0, r0, r1
+; CHECK-ARMBASEDSP-NEXT:    cmp r0, #127
+; CHECK-ARMBASEDSP-NEXT:    movge r0, #127
+; CHECK-ARMBASEDSP-NEXT:    cmn r0, #128
+; CHECK-ARMBASEDSP-NEXT:    mvnle r0, #127
+; CHECK-ARMBASEDSP-NEXT:    bx lr
+;
+; CHECK-ARMDSP-LABEL: func8:
+; CHECK-ARMDSP:       @ %bb.0:
+; CHECK-ARMDSP-NEXT:    qsub8 r0, r0, r1
+; CHECK-ARMDSP-NEXT:    sxtb r0, r0
+; CHECK-ARMDSP-NEXT:    bx lr
   %tmp = call i8 @llvm.ssub.sat.i8(i8 %x, i8 %y)
   ret i8 %tmp
 }
diff --git a/test/CodeGen/ARM/ssub_sat_plus.ll b/test/CodeGen/ARM/ssub_sat_plus.ll
index 9a59840..02a0de2 100644
--- a/test/CodeGen/ARM/ssub_sat_plus.ll
+++ b/test/CodeGen/ARM/ssub_sat_plus.ll
@@ -245,34 +245,34 @@
 ; CHECK-T1-NEXT:  .LCPI2_1:
 ; CHECK-T1-NEXT:    .long 4294934528 @ 0xffff8000
 ;
-; CHECK-T2-LABEL: func16:
-; CHECK-T2:       @ %bb.0:
-; CHECK-T2-NEXT:    muls r1, r2, r1
-; CHECK-T2-NEXT:    sxth r1, r1
-; CHECK-T2-NEXT:    subs r0, r0, r1
-; CHECK-T2-NEXT:    movw r1, #32767
-; CHECK-T2-NEXT:    cmp r0, r1
-; CHECK-T2-NEXT:    it lt
-; CHECK-T2-NEXT:    movlt r1, r0
-; CHECK-T2-NEXT:    movw r0, #32768
-; CHECK-T2-NEXT:    movt r0, #65535
-; CHECK-T2-NEXT:    cmn.w r1, #32768
-; CHECK-T2-NEXT:    it gt
-; CHECK-T2-NEXT:    movgt r0, r1
-; CHECK-T2-NEXT:    bx lr
+; CHECK-T2NODSP-LABEL: func16:
+; CHECK-T2NODSP:       @ %bb.0:
+; CHECK-T2NODSP-NEXT:    muls r1, r2, r1
+; CHECK-T2NODSP-NEXT:    sxth r1, r1
+; CHECK-T2NODSP-NEXT:    subs r0, r0, r1
+; CHECK-T2NODSP-NEXT:    movw r1, #32767
+; CHECK-T2NODSP-NEXT:    cmp r0, r1
+; CHECK-T2NODSP-NEXT:    it lt
+; CHECK-T2NODSP-NEXT:    movlt r1, r0
+; CHECK-T2NODSP-NEXT:    movw r0, #32768
+; CHECK-T2NODSP-NEXT:    movt r0, #65535
+; CHECK-T2NODSP-NEXT:    cmn.w r1, #32768
+; CHECK-T2NODSP-NEXT:    it gt
+; CHECK-T2NODSP-NEXT:    movgt r0, r1
+; CHECK-T2NODSP-NEXT:    bx lr
+;
+; CHECK-T2DSP-LABEL: func16:
+; CHECK-T2DSP:       @ %bb.0:
+; CHECK-T2DSP-NEXT:    muls r1, r2, r1
+; CHECK-T2DSP-NEXT:    qsub16 r0, r0, r1
+; CHECK-T2DSP-NEXT:    sxth r0, r0
+; CHECK-T2DSP-NEXT:    bx lr
 ;
 ; CHECK-ARM-LABEL: func16:
 ; CHECK-ARM:       @ %bb.0:
 ; CHECK-ARM-NEXT:    smulbb r1, r1, r2
-; CHECK-ARM-NEXT:    sxth r1, r1
-; CHECK-ARM-NEXT:    sub r0, r0, r1
-; CHECK-ARM-NEXT:    movw r1, #32767
-; CHECK-ARM-NEXT:    cmp r0, r1
-; CHECK-ARM-NEXT:    movlt r1, r0
-; CHECK-ARM-NEXT:    movw r0, #32768
-; CHECK-ARM-NEXT:    movt r0, #65535
-; CHECK-ARM-NEXT:    cmn r1, #32768
-; CHECK-ARM-NEXT:    movgt r0, r1
+; CHECK-ARM-NEXT:    qsub16 r0, r0, r1
+; CHECK-ARM-NEXT:    sxth r0, r0
 ; CHECK-ARM-NEXT:    bx lr
   %a = mul i16 %y, %z
   %tmp = call i16 @llvm.ssub.sat.i16(i16 %x, i16 %a)
@@ -299,28 +299,31 @@
 ; CHECK-T1-NEXT:  .LBB3_4:
 ; CHECK-T1-NEXT:    bx lr
 ;
-; CHECK-T2-LABEL: func8:
-; CHECK-T2:       @ %bb.0:
-; CHECK-T2-NEXT:    muls r1, r2, r1
-; CHECK-T2-NEXT:    sxtb r1, r1
-; CHECK-T2-NEXT:    subs r0, r0, r1
-; CHECK-T2-NEXT:    cmp r0, #127
-; CHECK-T2-NEXT:    it ge
-; CHECK-T2-NEXT:    movge r0, #127
-; CHECK-T2-NEXT:    cmn.w r0, #128
-; CHECK-T2-NEXT:    it le
-; CHECK-T2-NEXT:    mvnle r0, #127
-; CHECK-T2-NEXT:    bx lr
+; CHECK-T2NODSP-LABEL: func8:
+; CHECK-T2NODSP:       @ %bb.0:
+; CHECK-T2NODSP-NEXT:    muls r1, r2, r1
+; CHECK-T2NODSP-NEXT:    sxtb r1, r1
+; CHECK-T2NODSP-NEXT:    subs r0, r0, r1
+; CHECK-T2NODSP-NEXT:    cmp r0, #127
+; CHECK-T2NODSP-NEXT:    it ge
+; CHECK-T2NODSP-NEXT:    movge r0, #127
+; CHECK-T2NODSP-NEXT:    cmn.w r0, #128
+; CHECK-T2NODSP-NEXT:    it le
+; CHECK-T2NODSP-NEXT:    mvnle r0, #127
+; CHECK-T2NODSP-NEXT:    bx lr
+;
+; CHECK-T2DSP-LABEL: func8:
+; CHECK-T2DSP:       @ %bb.0:
+; CHECK-T2DSP-NEXT:    muls r1, r2, r1
+; CHECK-T2DSP-NEXT:    qsub8 r0, r0, r1
+; CHECK-T2DSP-NEXT:    sxtb r0, r0
+; CHECK-T2DSP-NEXT:    bx lr
 ;
 ; CHECK-ARM-LABEL: func8:
 ; CHECK-ARM:       @ %bb.0:
 ; CHECK-ARM-NEXT:    smulbb r1, r1, r2
-; CHECK-ARM-NEXT:    sxtb r1, r1
-; CHECK-ARM-NEXT:    sub r0, r0, r1
-; CHECK-ARM-NEXT:    cmp r0, #127
-; CHECK-ARM-NEXT:    movge r0, #127
-; CHECK-ARM-NEXT:    cmn r0, #128
-; CHECK-ARM-NEXT:    mvnle r0, #127
+; CHECK-ARM-NEXT:    qsub8 r0, r0, r1
+; CHECK-ARM-NEXT:    sxtb r0, r0
 ; CHECK-ARM-NEXT:    bx lr
   %a = mul i8 %y, %z
   %tmp = call i8 @llvm.ssub.sat.i8(i8 %x, i8 %a)