[TwoAddressInstructionPass] Create register mapping for registers with multiple uses in the current MBB

Currently we create register mappings only for registers used once in the
current MBB. For a register with multiple uses, when all of its uses are in the
current MBB, we can similarly create a mapping based on the last use.
For example:

    %reg101 = ...
            = ... reg101
    %reg103 = ADD %reg101, %reg102

We can create a mapping between %reg101 and %reg103.
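
The heart of the change is the rewritten use scan in findOnlyInterestingUse.
An annotated restatement of that scan follows (the comments are editorial; the
code itself matches the first hunk below):

    MachineOperand *UseOp = nullptr;
    for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
      MachineInstr *MI = MO.getParent();
      // Any use outside the current MBB makes the mapping unsafe; give up.
      if (MI->getParent() != MBB)
        return nullptr;
      // Remember the use that kills Reg, i.e. the last use in the block,
      // as the candidate for the mapping.
      if (isPlainlyKilled(MI, Reg, LIS))
        UseOp = &MO;
    }
    if (!UseOp)
      return nullptr;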

Differential Revision: https://reviews.llvm.org/D113193
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 627f90b..dfd962b 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -373,19 +373,25 @@
   return false;
 }
 
-/// Given a register, if has a single in-basic block use, return the use
-/// instruction if it's a copy or a two-address use.
+/// Given a register, if all its uses are in the same basic block, return the
+/// last use instruction if it's a copy or a two-address use.
 static MachineInstr *
 findOnlyInterestingUse(Register Reg, MachineBasicBlock *MBB,
                        MachineRegisterInfo *MRI, const TargetInstrInfo *TII,
-                       bool &IsCopy, Register &DstReg, bool &IsDstPhys) {
-  if (!MRI->hasOneNonDBGUse(Reg))
-    // None or more than one use.
+                       bool &IsCopy, Register &DstReg, bool &IsDstPhys,
+                       LiveIntervals *LIS) {
+  MachineOperand *UseOp = nullptr;
+  for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
+    MachineInstr *MI = MO.getParent();
+    if (MI->getParent() != MBB)
+      return nullptr;
+    if (isPlainlyKilled(MI, Reg, LIS))
+      UseOp = &MO;
+  }
+  if (!UseOp)
     return nullptr;
-  MachineOperand &UseOp = *MRI->use_nodbg_begin(Reg);
-  MachineInstr &UseMI = *UseOp.getParent();
-  if (UseMI.getParent() != MBB)
-    return nullptr;
+  MachineInstr &UseMI = *UseOp->getParent();
+
   Register SrcReg;
   bool IsSrcPhys;
   if (isCopyToReg(UseMI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys)) {
@@ -399,7 +405,7 @@
   }
   if (UseMI.isCommutable()) {
     unsigned Src1 = TargetInstrInfo::CommuteAnyOperandIndex;
-    unsigned Src2 = UseMI.getOperandNo(&UseOp);
+    unsigned Src2 = UseMI.getOperandNo(UseOp);
     if (TII->findCommutedOpIndices(UseMI, Src1, Src2)) {
       MachineOperand &MO = UseMI.getOperand(Src1);
       if (MO.isReg() && MO.isUse() &&
@@ -722,7 +728,7 @@
   Register NewReg;
   Register Reg = DstReg;
   while (MachineInstr *UseMI = findOnlyInterestingUse(Reg, MBB, MRI, TII,IsCopy,
-                                                      NewReg, IsDstPhys)) {
+                                                      NewReg, IsDstPhys, LIS)) {
     if (IsCopy && !Processed.insert(UseMI).second)
       break;
 
diff --git a/llvm/test/CodeGen/ARM/fpclamptosat.ll b/llvm/test/CodeGen/ARM/fpclamptosat.ll
index 9d5308b..eff1755 100644
--- a/llvm/test/CodeGen/ARM/fpclamptosat.ll
+++ b/llvm/test/CodeGen/ARM/fpclamptosat.ll
@@ -57,22 +57,21 @@
 ; VFP2-NEXT:    push {r7, lr}
 ; VFP2-NEXT:    vmov r0, r1, d0
 ; VFP2-NEXT:    bl __aeabi_d2lz
-; VFP2-NEXT:    mvn r2, #-2147483648
-; VFP2-NEXT:    subs r3, r0, r2
-; VFP2-NEXT:    mov.w r12, #0
+; VFP2-NEXT:    mvn r12, #-2147483648
+; VFP2-NEXT:    subs.w r3, r0, r12
+; VFP2-NEXT:    mov.w r2, #0
 ; VFP2-NEXT:    sbcs r3, r1, #0
 ; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt.w r12, #1
-; VFP2-NEXT:    cmp.w r12, #0
-; VFP2-NEXT:    itt ne
-; VFP2-NEXT:    movne r12, r1
-; VFP2-NEXT:    movne r2, r0
-; VFP2-NEXT:    mov.w r0, #-1
-; VFP2-NEXT:    rsbs.w r1, r2, #-2147483648
-; VFP2-NEXT:    sbcs.w r0, r0, r12
+; VFP2-NEXT:    movlt r2, #1
+; VFP2-NEXT:    cmp r2, #0
+; VFP2-NEXT:    ite ne
+; VFP2-NEXT:    movne r2, r1
+; VFP2-NEXT:    moveq r0, r12
+; VFP2-NEXT:    mov.w r1, #-1
+; VFP2-NEXT:    rsbs.w r3, r0, #-2147483648
+; VFP2-NEXT:    sbcs r1, r2
 ; VFP2-NEXT:    it ge
-; VFP2-NEXT:    movge.w r2, #-2147483648
-; VFP2-NEXT:    mov r0, r2
+; VFP2-NEXT:    movge.w r0, #-2147483648
 ; VFP2-NEXT:    pop {r7, pc}
 ;
 ; FULL-LABEL: stest_f64i32:
@@ -293,22 +292,21 @@
 ; VFP2-NEXT:    push {r7, lr}
 ; VFP2-NEXT:    vmov r0, s0
 ; VFP2-NEXT:    bl __aeabi_f2lz
-; VFP2-NEXT:    mvn r2, #-2147483648
-; VFP2-NEXT:    subs r3, r0, r2
-; VFP2-NEXT:    mov.w r12, #0
+; VFP2-NEXT:    mvn r12, #-2147483648
+; VFP2-NEXT:    subs.w r3, r0, r12
+; VFP2-NEXT:    mov.w r2, #0
 ; VFP2-NEXT:    sbcs r3, r1, #0
 ; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt.w r12, #1
-; VFP2-NEXT:    cmp.w r12, #0
-; VFP2-NEXT:    itt ne
-; VFP2-NEXT:    movne r12, r1
-; VFP2-NEXT:    movne r2, r0
-; VFP2-NEXT:    mov.w r0, #-1
-; VFP2-NEXT:    rsbs.w r1, r2, #-2147483648
-; VFP2-NEXT:    sbcs.w r0, r0, r12
+; VFP2-NEXT:    movlt r2, #1
+; VFP2-NEXT:    cmp r2, #0
+; VFP2-NEXT:    ite ne
+; VFP2-NEXT:    movne r2, r1
+; VFP2-NEXT:    moveq r0, r12
+; VFP2-NEXT:    mov.w r1, #-1
+; VFP2-NEXT:    rsbs.w r3, r0, #-2147483648
+; VFP2-NEXT:    sbcs r1, r2
 ; VFP2-NEXT:    it ge
-; VFP2-NEXT:    movge.w r2, #-2147483648
-; VFP2-NEXT:    mov r0, r2
+; VFP2-NEXT:    movge.w r0, #-2147483648
 ; VFP2-NEXT:    pop {r7, pc}
 ;
 ; FULL-LABEL: stest_f32i32:
@@ -532,22 +530,21 @@
 ; VFP2-NEXT:    vmov r0, s0
 ; VFP2-NEXT:    bl __aeabi_h2f
 ; VFP2-NEXT:    bl __aeabi_f2lz
-; VFP2-NEXT:    mvn r2, #-2147483648
-; VFP2-NEXT:    subs r3, r0, r2
-; VFP2-NEXT:    mov.w r12, #0
+; VFP2-NEXT:    mvn r12, #-2147483648
+; VFP2-NEXT:    subs.w r3, r0, r12
+; VFP2-NEXT:    mov.w r2, #0
 ; VFP2-NEXT:    sbcs r3, r1, #0
 ; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt.w r12, #1
-; VFP2-NEXT:    cmp.w r12, #0
-; VFP2-NEXT:    itt ne
-; VFP2-NEXT:    movne r12, r1
-; VFP2-NEXT:    movne r2, r0
-; VFP2-NEXT:    mov.w r0, #-1
-; VFP2-NEXT:    rsbs.w r1, r2, #-2147483648
-; VFP2-NEXT:    sbcs.w r0, r0, r12
+; VFP2-NEXT:    movlt r2, #1
+; VFP2-NEXT:    cmp r2, #0
+; VFP2-NEXT:    ite ne
+; VFP2-NEXT:    movne r2, r1
+; VFP2-NEXT:    moveq r0, r12
+; VFP2-NEXT:    mov.w r1, #-1
+; VFP2-NEXT:    rsbs.w r3, r0, #-2147483648
+; VFP2-NEXT:    sbcs r1, r2
 ; VFP2-NEXT:    it ge
-; VFP2-NEXT:    movge.w r2, #-2147483648
-; VFP2-NEXT:    mov r0, r2
+; VFP2-NEXT:    movge.w r0, #-2147483648
 ; VFP2-NEXT:    pop {r7, pc}
 ;
 ; FULL-LABEL: stest_f16i32:
@@ -1256,26 +1253,25 @@
 ; VFP2-NEXT:    push {r4, r5, r7, lr}
 ; VFP2-NEXT:    bl __fixdfti
 ; VFP2-NEXT:    subs.w r4, r0, #-1
-; VFP2-NEXT:    mov r12, r1
-; VFP2-NEXT:    mvn r1, #-2147483648
-; VFP2-NEXT:    sbcs.w r4, r12, r1
+; VFP2-NEXT:    mvn lr, #-2147483648
+; VFP2-NEXT:    sbcs.w r4, r1, lr
+; VFP2-NEXT:    mov.w r12, #0
 ; VFP2-NEXT:    sbcs r4, r2, #0
-; VFP2-NEXT:    mov.w lr, #0
 ; VFP2-NEXT:    sbcs r4, r3, #0
 ; VFP2-NEXT:    mov.w r4, #0
 ; VFP2-NEXT:    it lt
 ; VFP2-NEXT:    movlt r4, #1
 ; VFP2-NEXT:    cmp r4, #0
-; VFP2-NEXT:    itee eq
+; VFP2-NEXT:    itet eq
 ; VFP2-NEXT:    moveq r3, r4
 ; VFP2-NEXT:    movne r4, r2
-; VFP2-NEXT:    movne r1, r12
+; VFP2-NEXT:    moveq r1, lr
 ; VFP2-NEXT:    mov.w r2, #-1
 ; VFP2-NEXT:    it eq
 ; VFP2-NEXT:    moveq r0, r2
 ; VFP2-NEXT:    rsbs r5, r0, #0
-; VFP2-NEXT:    mov.w r12, #-2147483648
-; VFP2-NEXT:    sbcs.w r5, r12, r1
+; VFP2-NEXT:    mov.w lr, #-2147483648
+; VFP2-NEXT:    sbcs.w r5, lr, r1
 ; VFP2-NEXT:    sbcs.w r4, r2, r4
 ; VFP2-NEXT:    sbcs r2, r3
 ; VFP2-NEXT:    mov.w r2, #0
@@ -1283,8 +1279,8 @@
 ; VFP2-NEXT:    movlt r2, #1
 ; VFP2-NEXT:    cmp r2, #0
 ; VFP2-NEXT:    itt eq
-; VFP2-NEXT:    moveq r0, lr
-; VFP2-NEXT:    moveq r1, r12
+; VFP2-NEXT:    moveq r0, r12
+; VFP2-NEXT:    moveq r1, lr
 ; VFP2-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; FULL-LABEL: stest_f64i64:
@@ -1623,26 +1619,25 @@
 ; VFP2-NEXT:    push {r4, r5, r7, lr}
 ; VFP2-NEXT:    bl __fixsfti
 ; VFP2-NEXT:    subs.w r4, r0, #-1
-; VFP2-NEXT:    mov r12, r1
-; VFP2-NEXT:    mvn r1, #-2147483648
-; VFP2-NEXT:    sbcs.w r4, r12, r1
+; VFP2-NEXT:    mvn lr, #-2147483648
+; VFP2-NEXT:    sbcs.w r4, r1, lr
+; VFP2-NEXT:    mov.w r12, #0
 ; VFP2-NEXT:    sbcs r4, r2, #0
-; VFP2-NEXT:    mov.w lr, #0
 ; VFP2-NEXT:    sbcs r4, r3, #0
 ; VFP2-NEXT:    mov.w r4, #0
 ; VFP2-NEXT:    it lt
 ; VFP2-NEXT:    movlt r4, #1
 ; VFP2-NEXT:    cmp r4, #0
-; VFP2-NEXT:    itee eq
+; VFP2-NEXT:    itet eq
 ; VFP2-NEXT:    moveq r3, r4
 ; VFP2-NEXT:    movne r4, r2
-; VFP2-NEXT:    movne r1, r12
+; VFP2-NEXT:    moveq r1, lr
 ; VFP2-NEXT:    mov.w r2, #-1
 ; VFP2-NEXT:    it eq
 ; VFP2-NEXT:    moveq r0, r2
 ; VFP2-NEXT:    rsbs r5, r0, #0
-; VFP2-NEXT:    mov.w r12, #-2147483648
-; VFP2-NEXT:    sbcs.w r5, r12, r1
+; VFP2-NEXT:    mov.w lr, #-2147483648
+; VFP2-NEXT:    sbcs.w r5, lr, r1
 ; VFP2-NEXT:    sbcs.w r4, r2, r4
 ; VFP2-NEXT:    sbcs r2, r3
 ; VFP2-NEXT:    mov.w r2, #0
@@ -1650,8 +1645,8 @@
 ; VFP2-NEXT:    movlt r2, #1
 ; VFP2-NEXT:    cmp r2, #0
 ; VFP2-NEXT:    itt eq
-; VFP2-NEXT:    moveq r0, lr
-; VFP2-NEXT:    moveq r1, r12
+; VFP2-NEXT:    moveq r0, r12
+; VFP2-NEXT:    moveq r1, lr
 ; VFP2-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; FULL-LABEL: stest_f32i64:
@@ -1995,26 +1990,25 @@
 ; VFP2-NEXT:    vmov s0, r0
 ; VFP2-NEXT:    bl __fixsfti
 ; VFP2-NEXT:    subs.w r4, r0, #-1
-; VFP2-NEXT:    mov r12, r1
-; VFP2-NEXT:    mvn r1, #-2147483648
-; VFP2-NEXT:    sbcs.w r4, r12, r1
+; VFP2-NEXT:    mvn lr, #-2147483648
+; VFP2-NEXT:    sbcs.w r4, r1, lr
+; VFP2-NEXT:    mov.w r12, #0
 ; VFP2-NEXT:    sbcs r4, r2, #0
-; VFP2-NEXT:    mov.w lr, #0
 ; VFP2-NEXT:    sbcs r4, r3, #0
 ; VFP2-NEXT:    mov.w r4, #0
 ; VFP2-NEXT:    it lt
 ; VFP2-NEXT:    movlt r4, #1
 ; VFP2-NEXT:    cmp r4, #0
-; VFP2-NEXT:    itee eq
+; VFP2-NEXT:    itet eq
 ; VFP2-NEXT:    moveq r3, r4
 ; VFP2-NEXT:    movne r4, r2
-; VFP2-NEXT:    movne r1, r12
+; VFP2-NEXT:    moveq r1, lr
 ; VFP2-NEXT:    mov.w r2, #-1
 ; VFP2-NEXT:    it eq
 ; VFP2-NEXT:    moveq r0, r2
 ; VFP2-NEXT:    rsbs r5, r0, #0
-; VFP2-NEXT:    mov.w r12, #-2147483648
-; VFP2-NEXT:    sbcs.w r5, r12, r1
+; VFP2-NEXT:    mov.w lr, #-2147483648
+; VFP2-NEXT:    sbcs.w r5, lr, r1
 ; VFP2-NEXT:    sbcs.w r4, r2, r4
 ; VFP2-NEXT:    sbcs r2, r3
 ; VFP2-NEXT:    mov.w r2, #0
@@ -2022,8 +2016,8 @@
 ; VFP2-NEXT:    movlt r2, #1
 ; VFP2-NEXT:    cmp r2, #0
 ; VFP2-NEXT:    itt eq
-; VFP2-NEXT:    moveq r0, lr
-; VFP2-NEXT:    moveq r1, r12
+; VFP2-NEXT:    moveq r0, r12
+; VFP2-NEXT:    moveq r1, lr
 ; VFP2-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; FULL-LABEL: stest_f16i64:
@@ -2365,24 +2359,23 @@
 ; VFP2-NEXT:    it lo
 ; VFP2-NEXT:    movlo r3, r0
 ; VFP2-NEXT:    cmp r1, #0
-; VFP2-NEXT:    it mi
-; VFP2-NEXT:    movmi r2, r0
-; VFP2-NEXT:    mov.w r0, #-2147483648
+; VFP2-NEXT:    it pl
+; VFP2-NEXT:    movpl r0, r2
+; VFP2-NEXT:    mov.w r2, #-2147483648
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r2, r3
+; VFP2-NEXT:    moveq r0, r3
 ; VFP2-NEXT:    it pl
 ; VFP2-NEXT:    movpl r1, #0
 ; VFP2-NEXT:    cmp.w r1, #-1
 ; VFP2-NEXT:    mov.w r3, #-2147483648
 ; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt r3, r2
-; VFP2-NEXT:    cmp.w r2, #-2147483648
+; VFP2-NEXT:    movgt r3, r0
+; VFP2-NEXT:    cmp.w r0, #-2147483648
 ; VFP2-NEXT:    it ls
-; VFP2-NEXT:    movls r2, r0
-; VFP2-NEXT:    adds r0, r1, #1
+; VFP2-NEXT:    movls r0, r2
+; VFP2-NEXT:    adds r1, #1
 ; VFP2-NEXT:    it ne
-; VFP2-NEXT:    movne r2, r3
-; VFP2-NEXT:    mov r0, r2
+; VFP2-NEXT:    movne r0, r3
 ; VFP2-NEXT:    pop {r7, pc}
 ;
 ; FULL-LABEL: stest_f64i32_mm:
@@ -2500,16 +2493,16 @@
 ; VFP2-NEXT:    cmp r1, #0
 ; VFP2-NEXT:    it pl
 ; VFP2-NEXT:    movpl.w r2, #-1
-; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r2, r0
-; VFP2-NEXT:    mov.w r0, #0
+; VFP2-NEXT:    it ne
+; VFP2-NEXT:    movne r0, r2
+; VFP2-NEXT:    mov.w r2, #0
 ; VFP2-NEXT:    it pl
-; VFP2-NEXT:    movpl r1, r0
+; VFP2-NEXT:    movpl r1, r2
 ; VFP2-NEXT:    cmp r1, #0
 ; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt r0, r2
-; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r0, r2
+; VFP2-NEXT:    movgt r2, r0
+; VFP2-NEXT:    it ne
+; VFP2-NEXT:    movne r0, r2
 ; VFP2-NEXT:    pop {r7, pc}
 ;
 ; FULL-LABEL: ustest_f64i32_mm:
@@ -2613,24 +2606,23 @@
 ; VFP2-NEXT:    it lo
 ; VFP2-NEXT:    movlo r3, r0
 ; VFP2-NEXT:    cmp r1, #0
-; VFP2-NEXT:    it mi
-; VFP2-NEXT:    movmi r2, r0
-; VFP2-NEXT:    mov.w r0, #-2147483648
+; VFP2-NEXT:    it pl
+; VFP2-NEXT:    movpl r0, r2
+; VFP2-NEXT:    mov.w r2, #-2147483648
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r2, r3
+; VFP2-NEXT:    moveq r0, r3
 ; VFP2-NEXT:    it pl
 ; VFP2-NEXT:    movpl r1, #0
 ; VFP2-NEXT:    cmp.w r1, #-1
 ; VFP2-NEXT:    mov.w r3, #-2147483648
 ; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt r3, r2
-; VFP2-NEXT:    cmp.w r2, #-2147483648
+; VFP2-NEXT:    movgt r3, r0
+; VFP2-NEXT:    cmp.w r0, #-2147483648
 ; VFP2-NEXT:    it ls
-; VFP2-NEXT:    movls r2, r0
-; VFP2-NEXT:    adds r0, r1, #1
+; VFP2-NEXT:    movls r0, r2
+; VFP2-NEXT:    adds r1, #1
 ; VFP2-NEXT:    it ne
-; VFP2-NEXT:    movne r2, r3
-; VFP2-NEXT:    mov r0, r2
+; VFP2-NEXT:    movne r0, r3
 ; VFP2-NEXT:    pop {r7, pc}
 ;
 ; FULL-LABEL: stest_f32i32_mm:
@@ -2744,16 +2736,16 @@
 ; VFP2-NEXT:    cmp r1, #0
 ; VFP2-NEXT:    it pl
 ; VFP2-NEXT:    movpl.w r2, #-1
-; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r2, r0
-; VFP2-NEXT:    mov.w r0, #0
+; VFP2-NEXT:    it ne
+; VFP2-NEXT:    movne r0, r2
+; VFP2-NEXT:    mov.w r2, #0
 ; VFP2-NEXT:    it pl
-; VFP2-NEXT:    movpl r1, r0
+; VFP2-NEXT:    movpl r1, r2
 ; VFP2-NEXT:    cmp r1, #0
 ; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt r0, r2
-; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r0, r2
+; VFP2-NEXT:    movgt r2, r0
+; VFP2-NEXT:    it ne
+; VFP2-NEXT:    movne r0, r2
 ; VFP2-NEXT:    pop {r7, pc}
 ;
 ; FULL-LABEL: ustest_f32i32_mm:
@@ -2860,24 +2852,23 @@
 ; VFP2-NEXT:    it lo
 ; VFP2-NEXT:    movlo r3, r0
 ; VFP2-NEXT:    cmp r1, #0
-; VFP2-NEXT:    it mi
-; VFP2-NEXT:    movmi r2, r0
-; VFP2-NEXT:    mov.w r0, #-2147483648
+; VFP2-NEXT:    it pl
+; VFP2-NEXT:    movpl r0, r2
+; VFP2-NEXT:    mov.w r2, #-2147483648
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r2, r3
+; VFP2-NEXT:    moveq r0, r3
 ; VFP2-NEXT:    it pl
 ; VFP2-NEXT:    movpl r1, #0
 ; VFP2-NEXT:    cmp.w r1, #-1
 ; VFP2-NEXT:    mov.w r3, #-2147483648
 ; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt r3, r2
-; VFP2-NEXT:    cmp.w r2, #-2147483648
+; VFP2-NEXT:    movgt r3, r0
+; VFP2-NEXT:    cmp.w r0, #-2147483648
 ; VFP2-NEXT:    it ls
-; VFP2-NEXT:    movls r2, r0
-; VFP2-NEXT:    adds r0, r1, #1
+; VFP2-NEXT:    movls r0, r2
+; VFP2-NEXT:    adds r1, #1
 ; VFP2-NEXT:    it ne
-; VFP2-NEXT:    movne r2, r3
-; VFP2-NEXT:    mov r0, r2
+; VFP2-NEXT:    movne r0, r3
 ; VFP2-NEXT:    pop {r7, pc}
 ;
 ; FULL-LABEL: stest_f16i32_mm:
@@ -3010,16 +3001,16 @@
 ; VFP2-NEXT:    cmp r1, #0
 ; VFP2-NEXT:    it pl
 ; VFP2-NEXT:    movpl.w r2, #-1
-; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r2, r0
-; VFP2-NEXT:    mov.w r0, #0
+; VFP2-NEXT:    it ne
+; VFP2-NEXT:    movne r0, r2
+; VFP2-NEXT:    mov.w r2, #0
 ; VFP2-NEXT:    it pl
-; VFP2-NEXT:    movpl r1, r0
+; VFP2-NEXT:    movpl r1, r2
 ; VFP2-NEXT:    cmp r1, #0
 ; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt r0, r2
-; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r0, r2
+; VFP2-NEXT:    movgt r2, r0
+; VFP2-NEXT:    it ne
+; VFP2-NEXT:    movne r0, r2
 ; VFP2-NEXT:    pop {r7, pc}
 ;
 ; FULL-LABEL: ustest_f16i32_mm:
@@ -3084,13 +3075,13 @@
 ; VFP2-NEXT:    bl __aeabi_d2iz
 ; VFP2-NEXT:    movw r1, #32767
 ; VFP2-NEXT:    cmp r0, r1
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r1, r0
-; VFP2-NEXT:    movw r0, #32768
-; VFP2-NEXT:    movt r0, #65535
-; VFP2-NEXT:    cmn.w r1, #32768
-; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt r0, r1
+; VFP2-NEXT:    it ge
+; VFP2-NEXT:    movge r0, r1
+; VFP2-NEXT:    movw r1, #32768
+; VFP2-NEXT:    movt r1, #65535
+; VFP2-NEXT:    cmn.w r0, #32768
+; VFP2-NEXT:    it le
+; VFP2-NEXT:    movle r0, r1
 ; VFP2-NEXT:    pop {r7, pc}
 ;
 ; FULL-LABEL: stest_f64i16_mm:
@@ -3704,36 +3695,35 @@
 ; VFP2-NEXT:    it ne
 ; VFP2-NEXT:    andne.w r6, r2, r6, asr #31
 ; VFP2-NEXT:    and.w r2, r6, r5
-; VFP2-NEXT:    mov.w lr, #0
-; VFP2-NEXT:    adds r6, r2, #1
+; VFP2-NEXT:    mov.w r6, #-1
+; VFP2-NEXT:    adds r2, #1
 ; VFP2-NEXT:    it ne
 ; VFP2-NEXT:    movne r1, r7
 ; VFP2-NEXT:    mov.w r7, #-1
 ; VFP2-NEXT:    cmp r12, r8
 ; VFP2-NEXT:    it lo
 ; VFP2-NEXT:    movlo r7, r0
-; VFP2-NEXT:    mov.w r2, #-1
+; VFP2-NEXT:    mov.w lr, #0
 ; VFP2-NEXT:    it eq
 ; VFP2-NEXT:    moveq r7, r0
 ; VFP2-NEXT:    cmp r3, #0
-; VFP2-NEXT:    it mi
-; VFP2-NEXT:    movmi r2, r0
+; VFP2-NEXT:    it pl
+; VFP2-NEXT:    movpl r0, r6
 ; VFP2-NEXT:    cmp.w r9, #0
-; VFP2-NEXT:    mov.w r0, #0
+; VFP2-NEXT:    mov.w r3, #0
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r2, r7
+; VFP2-NEXT:    moveq r0, r7
 ; VFP2-NEXT:    cmp.w r4, #-2147483648
 ; VFP2-NEXT:    it hi
-; VFP2-NEXT:    movhi r0, r2
+; VFP2-NEXT:    movhi r3, r0
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r0, r2
+; VFP2-NEXT:    moveq r3, r0
 ; VFP2-NEXT:    cmp.w r5, #-1
 ; VFP2-NEXT:    it le
-; VFP2-NEXT:    movle r2, lr
-; VFP2-NEXT:    cmp r6, #0
+; VFP2-NEXT:    movle r0, lr
+; VFP2-NEXT:    cmp r2, #0
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r2, r0
-; VFP2-NEXT:    mov r0, r2
+; VFP2-NEXT:    moveq r0, r3
 ; VFP2-NEXT:    add sp, #4
 ; VFP2-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
 ;
@@ -4022,59 +4012,58 @@
 ; VFP2-NEXT:    push {r4, r5, r7, lr}
 ; VFP2-NEXT:    bl __fixdfti
 ; VFP2-NEXT:    subs r4, r2, #1
-; VFP2-NEXT:    mov r12, r1
-; VFP2-NEXT:    eor r1, r2, #1
+; VFP2-NEXT:    eor r12, r2, #1
 ; VFP2-NEXT:    sbcs r4, r3, #0
-; VFP2-NEXT:    mov.w lr, #0
-; VFP2-NEXT:    orr.w r5, r1, r3
+; VFP2-NEXT:    orr.w lr, r12, r3
+; VFP2-NEXT:    mov.w r4, #0
+; VFP2-NEXT:    mov.w r5, #1
 ; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt.w lr, #1
+; VFP2-NEXT:    movlt r4, #1
+; VFP2-NEXT:    cmp r4, #0
+; VFP2-NEXT:    it eq
+; VFP2-NEXT:    moveq r0, r4
 ; VFP2-NEXT:    cmp.w lr, #0
 ; VFP2-NEXT:    it eq
 ; VFP2-NEXT:    moveq r0, lr
-; VFP2-NEXT:    cmp r5, #0
+; VFP2-NEXT:    cmp r4, #0
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r0, r5
+; VFP2-NEXT:    moveq r1, r4
 ; VFP2-NEXT:    cmp.w lr, #0
-; VFP2-NEXT:    it ne
-; VFP2-NEXT:    movne lr, r12
-; VFP2-NEXT:    cmp r5, #0
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq lr, r5
-; VFP2-NEXT:    cmp.w lr, #0
-; VFP2-NEXT:    mov r12, lr
+; VFP2-NEXT:    moveq r1, lr
+; VFP2-NEXT:    cmp r1, #0
+; VFP2-NEXT:    mov lr, r1
 ; VFP2-NEXT:    mov.w r4, #1
 ; VFP2-NEXT:    ite ne
-; VFP2-NEXT:    movne r12, r0
-; VFP2-NEXT:    moveq r12, r0
+; VFP2-NEXT:    movne lr, r0
+; VFP2-NEXT:    moveq lr, r0
 ; VFP2-NEXT:    cmp r2, #1
-; VFP2-NEXT:    mov.w r5, #1
-; VFP2-NEXT:    mov.w r1, #0
 ; VFP2-NEXT:    it lo
 ; VFP2-NEXT:    movlo r5, r2
 ; VFP2-NEXT:    cmp r3, #0
 ; VFP2-NEXT:    it mi
 ; VFP2-NEXT:    movmi r4, r2
+; VFP2-NEXT:    mov.w r12, #0
 ; VFP2-NEXT:    it eq
 ; VFP2-NEXT:    moveq r4, r5
 ; VFP2-NEXT:    it pl
-; VFP2-NEXT:    movpl r3, r1
+; VFP2-NEXT:    movpl r3, r12
 ; VFP2-NEXT:    rsbs r2, r4, #0
-; VFP2-NEXT:    sbcs.w r2, r1, r3
+; VFP2-NEXT:    sbcs.w r2, r12, r3
 ; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r1, #1
-; VFP2-NEXT:    cmp r1, #0
-; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r0, r1
-; VFP2-NEXT:    orrs.w r2, r4, r3
+; VFP2-NEXT:    movlt.w r12, #1
+; VFP2-NEXT:    cmp.w r12, #0
 ; VFP2-NEXT:    it eq
 ; VFP2-NEXT:    moveq r0, r12
-; VFP2-NEXT:    cmp r1, #0
-; VFP2-NEXT:    it ne
-; VFP2-NEXT:    movne r1, lr
-; VFP2-NEXT:    cmp r2, #0
+; VFP2-NEXT:    orrs.w r2, r4, r3
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r1, lr
+; VFP2-NEXT:    moveq r0, lr
+; VFP2-NEXT:    cmp.w r12, #0
+; VFP2-NEXT:    it ne
+; VFP2-NEXT:    movne r12, r1
+; VFP2-NEXT:    cmp r2, #0
+; VFP2-NEXT:    it ne
+; VFP2-NEXT:    movne r1, r12
 ; VFP2-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; FULL-LABEL: ustest_f64i64_mm:
@@ -4293,36 +4282,35 @@
 ; VFP2-NEXT:    it ne
 ; VFP2-NEXT:    andne.w r6, r2, r6, asr #31
 ; VFP2-NEXT:    and.w r2, r6, r5
-; VFP2-NEXT:    mov.w lr, #0
-; VFP2-NEXT:    adds r6, r2, #1
+; VFP2-NEXT:    mov.w r6, #-1
+; VFP2-NEXT:    adds r2, #1
 ; VFP2-NEXT:    it ne
 ; VFP2-NEXT:    movne r1, r7
 ; VFP2-NEXT:    mov.w r7, #-1
 ; VFP2-NEXT:    cmp r12, r8
 ; VFP2-NEXT:    it lo
 ; VFP2-NEXT:    movlo r7, r0
-; VFP2-NEXT:    mov.w r2, #-1
+; VFP2-NEXT:    mov.w lr, #0
 ; VFP2-NEXT:    it eq
 ; VFP2-NEXT:    moveq r7, r0
 ; VFP2-NEXT:    cmp r3, #0
-; VFP2-NEXT:    it mi
-; VFP2-NEXT:    movmi r2, r0
+; VFP2-NEXT:    it pl
+; VFP2-NEXT:    movpl r0, r6
 ; VFP2-NEXT:    cmp.w r9, #0
-; VFP2-NEXT:    mov.w r0, #0
+; VFP2-NEXT:    mov.w r3, #0
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r2, r7
+; VFP2-NEXT:    moveq r0, r7
 ; VFP2-NEXT:    cmp.w r4, #-2147483648
 ; VFP2-NEXT:    it hi
-; VFP2-NEXT:    movhi r0, r2
+; VFP2-NEXT:    movhi r3, r0
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r0, r2
+; VFP2-NEXT:    moveq r3, r0
 ; VFP2-NEXT:    cmp.w r5, #-1
 ; VFP2-NEXT:    it le
-; VFP2-NEXT:    movle r2, lr
-; VFP2-NEXT:    cmp r6, #0
+; VFP2-NEXT:    movle r0, lr
+; VFP2-NEXT:    cmp r2, #0
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r2, r0
-; VFP2-NEXT:    mov r0, r2
+; VFP2-NEXT:    moveq r0, r3
 ; VFP2-NEXT:    add sp, #4
 ; VFP2-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
 ;
@@ -4605,59 +4593,58 @@
 ; VFP2-NEXT:    push {r4, r5, r7, lr}
 ; VFP2-NEXT:    bl __fixsfti
 ; VFP2-NEXT:    subs r4, r2, #1
-; VFP2-NEXT:    mov r12, r1
-; VFP2-NEXT:    eor r1, r2, #1
+; VFP2-NEXT:    eor r12, r2, #1
 ; VFP2-NEXT:    sbcs r4, r3, #0
-; VFP2-NEXT:    mov.w lr, #0
-; VFP2-NEXT:    orr.w r5, r1, r3
+; VFP2-NEXT:    orr.w lr, r12, r3
+; VFP2-NEXT:    mov.w r4, #0
+; VFP2-NEXT:    mov.w r5, #1
 ; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt.w lr, #1
+; VFP2-NEXT:    movlt r4, #1
+; VFP2-NEXT:    cmp r4, #0
+; VFP2-NEXT:    it eq
+; VFP2-NEXT:    moveq r0, r4
 ; VFP2-NEXT:    cmp.w lr, #0
 ; VFP2-NEXT:    it eq
 ; VFP2-NEXT:    moveq r0, lr
-; VFP2-NEXT:    cmp r5, #0
+; VFP2-NEXT:    cmp r4, #0
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r0, r5
+; VFP2-NEXT:    moveq r1, r4
 ; VFP2-NEXT:    cmp.w lr, #0
-; VFP2-NEXT:    it ne
-; VFP2-NEXT:    movne lr, r12
-; VFP2-NEXT:    cmp r5, #0
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq lr, r5
-; VFP2-NEXT:    cmp.w lr, #0
-; VFP2-NEXT:    mov r12, lr
+; VFP2-NEXT:    moveq r1, lr
+; VFP2-NEXT:    cmp r1, #0
+; VFP2-NEXT:    mov lr, r1
 ; VFP2-NEXT:    mov.w r4, #1
 ; VFP2-NEXT:    ite ne
-; VFP2-NEXT:    movne r12, r0
-; VFP2-NEXT:    moveq r12, r0
+; VFP2-NEXT:    movne lr, r0
+; VFP2-NEXT:    moveq lr, r0
 ; VFP2-NEXT:    cmp r2, #1
-; VFP2-NEXT:    mov.w r5, #1
-; VFP2-NEXT:    mov.w r1, #0
 ; VFP2-NEXT:    it lo
 ; VFP2-NEXT:    movlo r5, r2
 ; VFP2-NEXT:    cmp r3, #0
 ; VFP2-NEXT:    it mi
 ; VFP2-NEXT:    movmi r4, r2
+; VFP2-NEXT:    mov.w r12, #0
 ; VFP2-NEXT:    it eq
 ; VFP2-NEXT:    moveq r4, r5
 ; VFP2-NEXT:    it pl
-; VFP2-NEXT:    movpl r3, r1
+; VFP2-NEXT:    movpl r3, r12
 ; VFP2-NEXT:    rsbs r2, r4, #0
-; VFP2-NEXT:    sbcs.w r2, r1, r3
+; VFP2-NEXT:    sbcs.w r2, r12, r3
 ; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r1, #1
-; VFP2-NEXT:    cmp r1, #0
-; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r0, r1
-; VFP2-NEXT:    orrs.w r2, r4, r3
+; VFP2-NEXT:    movlt.w r12, #1
+; VFP2-NEXT:    cmp.w r12, #0
 ; VFP2-NEXT:    it eq
 ; VFP2-NEXT:    moveq r0, r12
-; VFP2-NEXT:    cmp r1, #0
-; VFP2-NEXT:    it ne
-; VFP2-NEXT:    movne r1, lr
-; VFP2-NEXT:    cmp r2, #0
+; VFP2-NEXT:    orrs.w r2, r4, r3
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r1, lr
+; VFP2-NEXT:    moveq r0, lr
+; VFP2-NEXT:    cmp.w r12, #0
+; VFP2-NEXT:    it ne
+; VFP2-NEXT:    movne r12, r1
+; VFP2-NEXT:    cmp r2, #0
+; VFP2-NEXT:    it ne
+; VFP2-NEXT:    movne r1, r12
 ; VFP2-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; FULL-LABEL: ustest_f32i64_mm:
@@ -4881,36 +4868,35 @@
 ; VFP2-NEXT:    it ne
 ; VFP2-NEXT:    andne.w r6, r2, r6, asr #31
 ; VFP2-NEXT:    and.w r2, r6, r5
-; VFP2-NEXT:    mov.w lr, #0
-; VFP2-NEXT:    adds r6, r2, #1
+; VFP2-NEXT:    mov.w r6, #-1
+; VFP2-NEXT:    adds r2, #1
 ; VFP2-NEXT:    it ne
 ; VFP2-NEXT:    movne r1, r7
 ; VFP2-NEXT:    mov.w r7, #-1
 ; VFP2-NEXT:    cmp r12, r8
 ; VFP2-NEXT:    it lo
 ; VFP2-NEXT:    movlo r7, r0
-; VFP2-NEXT:    mov.w r2, #-1
+; VFP2-NEXT:    mov.w lr, #0
 ; VFP2-NEXT:    it eq
 ; VFP2-NEXT:    moveq r7, r0
 ; VFP2-NEXT:    cmp r3, #0
-; VFP2-NEXT:    it mi
-; VFP2-NEXT:    movmi r2, r0
+; VFP2-NEXT:    it pl
+; VFP2-NEXT:    movpl r0, r6
 ; VFP2-NEXT:    cmp.w r9, #0
-; VFP2-NEXT:    mov.w r0, #0
+; VFP2-NEXT:    mov.w r3, #0
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r2, r7
+; VFP2-NEXT:    moveq r0, r7
 ; VFP2-NEXT:    cmp.w r4, #-2147483648
 ; VFP2-NEXT:    it hi
-; VFP2-NEXT:    movhi r0, r2
+; VFP2-NEXT:    movhi r3, r0
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r0, r2
+; VFP2-NEXT:    moveq r3, r0
 ; VFP2-NEXT:    cmp.w r5, #-1
 ; VFP2-NEXT:    it le
-; VFP2-NEXT:    movle r2, lr
-; VFP2-NEXT:    cmp r6, #0
+; VFP2-NEXT:    movle r0, lr
+; VFP2-NEXT:    cmp r2, #0
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r2, r0
-; VFP2-NEXT:    mov r0, r2
+; VFP2-NEXT:    moveq r0, r3
 ; VFP2-NEXT:    add sp, #4
 ; VFP2-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
 ;
@@ -5207,59 +5193,58 @@
 ; VFP2-NEXT:    vmov s0, r0
 ; VFP2-NEXT:    bl __fixsfti
 ; VFP2-NEXT:    subs r4, r2, #1
-; VFP2-NEXT:    mov r12, r1
-; VFP2-NEXT:    eor r1, r2, #1
+; VFP2-NEXT:    eor r12, r2, #1
 ; VFP2-NEXT:    sbcs r4, r3, #0
-; VFP2-NEXT:    mov.w lr, #0
-; VFP2-NEXT:    orr.w r5, r1, r3
+; VFP2-NEXT:    orr.w lr, r12, r3
+; VFP2-NEXT:    mov.w r4, #0
+; VFP2-NEXT:    mov.w r5, #1
 ; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt.w lr, #1
+; VFP2-NEXT:    movlt r4, #1
+; VFP2-NEXT:    cmp r4, #0
+; VFP2-NEXT:    it eq
+; VFP2-NEXT:    moveq r0, r4
 ; VFP2-NEXT:    cmp.w lr, #0
 ; VFP2-NEXT:    it eq
 ; VFP2-NEXT:    moveq r0, lr
-; VFP2-NEXT:    cmp r5, #0
+; VFP2-NEXT:    cmp r4, #0
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r0, r5
+; VFP2-NEXT:    moveq r1, r4
 ; VFP2-NEXT:    cmp.w lr, #0
-; VFP2-NEXT:    it ne
-; VFP2-NEXT:    movne lr, r12
-; VFP2-NEXT:    cmp r5, #0
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq lr, r5
-; VFP2-NEXT:    cmp.w lr, #0
-; VFP2-NEXT:    mov r12, lr
+; VFP2-NEXT:    moveq r1, lr
+; VFP2-NEXT:    cmp r1, #0
+; VFP2-NEXT:    mov lr, r1
 ; VFP2-NEXT:    mov.w r4, #1
 ; VFP2-NEXT:    ite ne
-; VFP2-NEXT:    movne r12, r0
-; VFP2-NEXT:    moveq r12, r0
+; VFP2-NEXT:    movne lr, r0
+; VFP2-NEXT:    moveq lr, r0
 ; VFP2-NEXT:    cmp r2, #1
-; VFP2-NEXT:    mov.w r5, #1
-; VFP2-NEXT:    mov.w r1, #0
 ; VFP2-NEXT:    it lo
 ; VFP2-NEXT:    movlo r5, r2
 ; VFP2-NEXT:    cmp r3, #0
 ; VFP2-NEXT:    it mi
 ; VFP2-NEXT:    movmi r4, r2
+; VFP2-NEXT:    mov.w r12, #0
 ; VFP2-NEXT:    it eq
 ; VFP2-NEXT:    moveq r4, r5
 ; VFP2-NEXT:    it pl
-; VFP2-NEXT:    movpl r3, r1
+; VFP2-NEXT:    movpl r3, r12
 ; VFP2-NEXT:    rsbs r2, r4, #0
-; VFP2-NEXT:    sbcs.w r2, r1, r3
+; VFP2-NEXT:    sbcs.w r2, r12, r3
 ; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r1, #1
-; VFP2-NEXT:    cmp r1, #0
-; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r0, r1
-; VFP2-NEXT:    orrs.w r2, r4, r3
+; VFP2-NEXT:    movlt.w r12, #1
+; VFP2-NEXT:    cmp.w r12, #0
 ; VFP2-NEXT:    it eq
 ; VFP2-NEXT:    moveq r0, r12
-; VFP2-NEXT:    cmp r1, #0
-; VFP2-NEXT:    it ne
-; VFP2-NEXT:    movne r1, lr
-; VFP2-NEXT:    cmp r2, #0
+; VFP2-NEXT:    orrs.w r2, r4, r3
 ; VFP2-NEXT:    it eq
-; VFP2-NEXT:    moveq r1, lr
+; VFP2-NEXT:    moveq r0, lr
+; VFP2-NEXT:    cmp.w r12, #0
+; VFP2-NEXT:    it ne
+; VFP2-NEXT:    movne r12, r1
+; VFP2-NEXT:    cmp r2, #0
+; VFP2-NEXT:    it ne
+; VFP2-NEXT:    movne r1, r12
 ; VFP2-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; FULL-LABEL: ustest_f16i64_mm:
diff --git a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
index c4fce77..cddbc5d 100644
--- a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
@@ -32,8 +32,8 @@
 ; THUMB6-NEXT:    uxtb r1, r1
 ; THUMB6-NEXT:    lsls r0, r1
 ; THUMB6-NEXT:    movs r1, #128
-; THUMB6-NEXT:    ands r1, r0
-; THUMB6-NEXT:    rsbs r0, r1, #0
+; THUMB6-NEXT:    ands r0, r1
+; THUMB6-NEXT:    rsbs r1, r0, #0
 ; THUMB6-NEXT:    adcs r0, r1
 ; THUMB6-NEXT:    bx lr
 ;
@@ -64,8 +64,8 @@
 ; THUMB6-NEXT:    uxtb r1, r1
 ; THUMB6-NEXT:    lsls r0, r1
 ; THUMB6-NEXT:    movs r1, #1
-; THUMB6-NEXT:    ands r1, r0
-; THUMB6-NEXT:    rsbs r0, r1, #0
+; THUMB6-NEXT:    ands r0, r1
+; THUMB6-NEXT:    rsbs r1, r0, #0
 ; THUMB6-NEXT:    adcs r0, r1
 ; THUMB6-NEXT:    bx lr
 ;
@@ -97,8 +97,8 @@
 ; THUMB6-NEXT:    uxtb r1, r1
 ; THUMB6-NEXT:    lsls r0, r1
 ; THUMB6-NEXT:    movs r1, #24
-; THUMB6-NEXT:    ands r1, r0
-; THUMB6-NEXT:    rsbs r0, r1, #0
+; THUMB6-NEXT:    ands r0, r1
+; THUMB6-NEXT:    rsbs r1, r0, #0
 ; THUMB6-NEXT:    adcs r0, r1
 ; THUMB6-NEXT:    bx lr
 ;
@@ -134,8 +134,8 @@
 ; THUMB6-NEXT:    lsls r0, r1
 ; THUMB6-NEXT:    movs r1, #1
 ; THUMB6-NEXT:    lsls r1, r1, #15
-; THUMB6-NEXT:    ands r1, r0
-; THUMB6-NEXT:    rsbs r0, r1, #0
+; THUMB6-NEXT:    ands r0, r1
+; THUMB6-NEXT:    rsbs r1, r0, #0
 ; THUMB6-NEXT:    adcs r0, r1
 ; THUMB6-NEXT:    bx lr
 ;
@@ -166,8 +166,8 @@
 ; THUMB6-NEXT:    uxth r1, r1
 ; THUMB6-NEXT:    lsls r0, r1
 ; THUMB6-NEXT:    movs r1, #1
-; THUMB6-NEXT:    ands r1, r0
-; THUMB6-NEXT:    rsbs r0, r1, #0
+; THUMB6-NEXT:    ands r0, r1
+; THUMB6-NEXT:    rsbs r1, r0, #0
 ; THUMB6-NEXT:    adcs r0, r1
 ; THUMB6-NEXT:    bx lr
 ;
@@ -200,8 +200,8 @@
 ; THUMB6-NEXT:    lsls r0, r1
 ; THUMB6-NEXT:    movs r1, #255
 ; THUMB6-NEXT:    lsls r1, r1, #4
-; THUMB6-NEXT:    ands r1, r0
-; THUMB6-NEXT:    rsbs r0, r1, #0
+; THUMB6-NEXT:    ands r0, r1
+; THUMB6-NEXT:    rsbs r1, r0, #0
 ; THUMB6-NEXT:    adcs r0, r1
 ; THUMB6-NEXT:    bx lr
 ;
@@ -233,8 +233,8 @@
 ; THUMB6-NEXT:    lsls r0, r1
 ; THUMB6-NEXT:    movs r1, #1
 ; THUMB6-NEXT:    lsls r1, r1, #31
-; THUMB6-NEXT:    ands r1, r0
-; THUMB6-NEXT:    rsbs r0, r1, #0
+; THUMB6-NEXT:    ands r0, r1
+; THUMB6-NEXT:    rsbs r1, r0, #0
 ; THUMB6-NEXT:    adcs r0, r1
 ; THUMB6-NEXT:    bx lr
 ;
@@ -261,8 +261,8 @@
 ; THUMB6:       @ %bb.0:
 ; THUMB6-NEXT:    lsls r0, r1
 ; THUMB6-NEXT:    movs r1, #1
-; THUMB6-NEXT:    ands r1, r0
-; THUMB6-NEXT:    rsbs r0, r1, #0
+; THUMB6-NEXT:    ands r0, r1
+; THUMB6-NEXT:    rsbs r1, r0, #0
 ; THUMB6-NEXT:    adcs r0, r1
 ; THUMB6-NEXT:    bx lr
 ;
@@ -301,8 +301,8 @@
 ; THUMB6:       @ %bb.0:
 ; THUMB6-NEXT:    lsls r0, r1
 ; THUMB6-NEXT:    ldr r1, .LCPI8_0
-; THUMB6-NEXT:    ands r1, r0
-; THUMB6-NEXT:    rsbs r0, r1, #0
+; THUMB6-NEXT:    ands r0, r1
+; THUMB6-NEXT:    rsbs r1, r0, #0
 ; THUMB6-NEXT:    adcs r0, r1
 ; THUMB6-NEXT:    bx lr
 ; THUMB6-NEXT:    .p2align 2
@@ -406,8 +406,8 @@
 ; THUMB6-NEXT:    push {r7, lr}
 ; THUMB6-NEXT:    bl __ashldi3
 ; THUMB6-NEXT:    movs r1, #1
-; THUMB6-NEXT:    ands r1, r0
-; THUMB6-NEXT:    rsbs r0, r1, #0
+; THUMB6-NEXT:    ands r0, r1
+; THUMB6-NEXT:    rsbs r1, r0, #0
 ; THUMB6-NEXT:    adcs r0, r1
 ; THUMB6-NEXT:    pop {r7, pc}
 ;
@@ -642,14 +642,14 @@
 ; THUMB6-NEXT:    ldr r4, [sp, #16]
 ; THUMB6-NEXT:    lsls r2, r4
 ; THUMB6-NEXT:    ldr r4, .LCPI13_0
-; THUMB6-NEXT:    ands r4, r2
-; THUMB6-NEXT:    rsbs r2, r4, #0
+; THUMB6-NEXT:    ands r2, r4
+; THUMB6-NEXT:    rsbs r4, r2, #0
 ; THUMB6-NEXT:    adcs r2, r4
 ; THUMB6-NEXT:    ldr r4, [sp, #20]
 ; THUMB6-NEXT:    lsls r3, r4
 ; THUMB6-NEXT:    lsls r4, r0, #31
-; THUMB6-NEXT:    ands r4, r3
-; THUMB6-NEXT:    rsbs r3, r4, #0
+; THUMB6-NEXT:    ands r3, r4
+; THUMB6-NEXT:    rsbs r4, r3, #0
 ; THUMB6-NEXT:    adcs r3, r4
 ; THUMB6-NEXT:    pop {r4, pc}
 ; THUMB6-NEXT:    .p2align 2
diff --git a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index a5d3347..4f112c3 100644
--- a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -303,8 +303,8 @@
 ; THUMB6-NEXT:    lsrs r0, r1
 ; THUMB6-NEXT:    movs r1, #1
 ; THUMB6-NEXT:    lsls r1, r1, #31
-; THUMB6-NEXT:    ands r1, r0
-; THUMB6-NEXT:    rsbs r0, r1, #0
+; THUMB6-NEXT:    ands r0, r1
+; THUMB6-NEXT:    rsbs r1, r0, #0
 ; THUMB6-NEXT:    adcs r0, r1
 ; THUMB6-NEXT:    bx lr
 ;
@@ -331,8 +331,8 @@
 ; THUMB6:       @ %bb.0:
 ; THUMB6-NEXT:    lsrs r0, r1
 ; THUMB6-NEXT:    movs r1, #1
-; THUMB6-NEXT:    ands r1, r0
-; THUMB6-NEXT:    rsbs r0, r1, #0
+; THUMB6-NEXT:    ands r0, r1
+; THUMB6-NEXT:    rsbs r1, r0, #0
 ; THUMB6-NEXT:    adcs r0, r1
 ; THUMB6-NEXT:    bx lr
 ;
@@ -371,8 +371,8 @@
 ; THUMB6:       @ %bb.0:
 ; THUMB6-NEXT:    lsrs r0, r1
 ; THUMB6-NEXT:    ldr r1, .LCPI8_0
-; THUMB6-NEXT:    ands r1, r0
-; THUMB6-NEXT:    rsbs r0, r1, #0
+; THUMB6-NEXT:    ands r0, r1
+; THUMB6-NEXT:    rsbs r1, r0, #0
 ; THUMB6-NEXT:    adcs r0, r1
 ; THUMB6-NEXT:    bx lr
 ; THUMB6-NEXT:    .p2align 2
@@ -459,8 +459,8 @@
 ; THUMB6-NEXT:    push {r7, lr}
 ; THUMB6-NEXT:    bl __lshrdi3
 ; THUMB6-NEXT:    movs r1, #1
-; THUMB6-NEXT:    ands r1, r0
-; THUMB6-NEXT:    rsbs r0, r1, #0
+; THUMB6-NEXT:    ands r0, r1
+; THUMB6-NEXT:    rsbs r1, r0, #0
 ; THUMB6-NEXT:    adcs r0, r1
 ; THUMB6-NEXT:    pop {r7, pc}
 ;
@@ -713,14 +713,14 @@
 ; THUMB6-NEXT:    ldr r4, [sp, #16]
 ; THUMB6-NEXT:    lsrs r2, r4
 ; THUMB6-NEXT:    ldr r4, .LCPI13_0
-; THUMB6-NEXT:    ands r4, r2
-; THUMB6-NEXT:    rsbs r2, r4, #0
+; THUMB6-NEXT:    ands r2, r4
+; THUMB6-NEXT:    rsbs r4, r2, #0
 ; THUMB6-NEXT:    adcs r2, r4
 ; THUMB6-NEXT:    ldr r4, [sp, #20]
 ; THUMB6-NEXT:    lsrs r3, r4
 ; THUMB6-NEXT:    lsls r4, r0, #31
-; THUMB6-NEXT:    ands r4, r3
-; THUMB6-NEXT:    rsbs r3, r4, #0
+; THUMB6-NEXT:    ands r3, r4
+; THUMB6-NEXT:    rsbs r4, r3, #0
 ; THUMB6-NEXT:    adcs r3, r4
 ; THUMB6-NEXT:    pop {r4, pc}
 ; THUMB6-NEXT:    .p2align 2
diff --git a/llvm/test/CodeGen/ARM/ssat.ll b/llvm/test/CodeGen/ARM/ssat.ll
index 3e1cdd4..436f6ed 100644
--- a/llvm/test/CodeGen/ARM/ssat.ll
+++ b/llvm/test/CodeGen/ARM/ssat.ll
@@ -21,11 +21,11 @@
 ; V4T:       @ %bb.0: @ %entry
 ; V4T-NEXT:    ldr r1, .LCPI0_0
 ; V4T-NEXT:    cmp r0, r1
-; V4T-NEXT:    movlt r1, r0
-; V4T-NEXT:    mov r0, #1065353216
-; V4T-NEXT:    orr r0, r0, #-1073741824
-; V4T-NEXT:    cmn r1, #8388608
-; V4T-NEXT:    movgt r0, r1
+; V4T-NEXT:    movge r0, r1
+; V4T-NEXT:    mov r1, #1065353216
+; V4T-NEXT:    orr r1, r1, #-1073741824
+; V4T-NEXT:    cmn r0, #8388608
+; V4T-NEXT:    movle r0, r1
 ; V4T-NEXT:    bx lr
 ; V4T-NEXT:    .p2align 2
 ; V4T-NEXT:  @ %bb.1:
@@ -54,12 +54,12 @@
 ; V4T-NEXT:    orr r2, r2, #1792
 ; V4T-NEXT:    asr r1, r1, #16
 ; V4T-NEXT:    cmp r1, r2
-; V4T-NEXT:    movlt r2, r0
-; V4T-NEXT:    lsl r0, r2, #16
-; V4T-NEXT:    asr r1, r0, #16
-; V4T-NEXT:    ldr r0, .LCPI1_0
+; V4T-NEXT:    movge r0, r2
+; V4T-NEXT:    ldr r2, .LCPI1_0
+; V4T-NEXT:    lsl r1, r0, #16
+; V4T-NEXT:    asr r1, r1, #16
 ; V4T-NEXT:    cmn r1, #2048
-; V4T-NEXT:    movgt r0, r2
+; V4T-NEXT:    movle r0, r2
 ; V4T-NEXT:    bx lr
 ; V4T-NEXT:    .p2align 2
 ; V4T-NEXT:  @ %bb.1:
@@ -71,12 +71,12 @@
 ; V6T2-NEXT:    sxth r1, r0
 ; V6T2-NEXT:    movw r2, #2047
 ; V6T2-NEXT:    cmp r1, r2
-; V6T2-NEXT:    movlt r2, r0
-; V6T2-NEXT:    movw r0, #63488
-; V6T2-NEXT:    sxth r1, r2
-; V6T2-NEXT:    movt r0, #65535
+; V6T2-NEXT:    movge r0, r2
+; V6T2-NEXT:    movw r2, #63488
+; V6T2-NEXT:    sxth r1, r0
+; V6T2-NEXT:    movt r2, #65535
 ; V6T2-NEXT:    cmn r1, #2048
-; V6T2-NEXT:    movgt r0, r2
+; V6T2-NEXT:    movle r0, r2
 ; V6T2-NEXT:    bx lr
 entry:
   %0 = icmp slt i16 %x, 2047
@@ -130,11 +130,11 @@
 ; V4T:       @ %bb.0: @ %entry
 ; V4T-NEXT:    ldr r1, .LCPI3_0
 ; V4T-NEXT:    cmp r0, r1
-; V4T-NEXT:    movlt r1, r0
-; V4T-NEXT:    mov r0, #1065353216
-; V4T-NEXT:    orr r0, r0, #-1073741824
-; V4T-NEXT:    cmn r1, #8388608
-; V4T-NEXT:    movgt r0, r1
+; V4T-NEXT:    movge r0, r1
+; V4T-NEXT:    mov r1, #1065353216
+; V4T-NEXT:    orr r1, r1, #-1073741824
+; V4T-NEXT:    cmn r0, #8388608
+; V4T-NEXT:    movle r0, r1
 ; V4T-NEXT:    bx lr
 ; V4T-NEXT:    .p2align 2
 ; V4T-NEXT:  @ %bb.1:
@@ -159,11 +159,11 @@
 ; V4T:       @ %bb.0: @ %entry
 ; V4T-NEXT:    ldr r1, .LCPI4_0
 ; V4T-NEXT:    cmp r0, r1
-; V4T-NEXT:    movlt r1, r0
-; V4T-NEXT:    mov r0, #1065353216
-; V4T-NEXT:    orr r0, r0, #-1073741824
-; V4T-NEXT:    cmn r1, #8388608
-; V4T-NEXT:    movgt r0, r1
+; V4T-NEXT:    movge r0, r1
+; V4T-NEXT:    mov r1, #1065353216
+; V4T-NEXT:    orr r1, r1, #-1073741824
+; V4T-NEXT:    cmn r0, #8388608
+; V4T-NEXT:    movle r0, r1
 ; V4T-NEXT:    bx lr
 ; V4T-NEXT:    .p2align 2
 ; V4T-NEXT:  @ %bb.1:
@@ -189,11 +189,10 @@
 ; V4T-NEXT:    mov r1, #1065353216
 ; V4T-NEXT:    cmn r0, #8388608
 ; V4T-NEXT:    orr r1, r1, #-1073741824
-; V4T-NEXT:    movgt r1, r0
-; V4T-NEXT:    ldr r0, .LCPI5_0
-; V4T-NEXT:    cmp r1, r0
-; V4T-NEXT:    movge r1, r0
-; V4T-NEXT:    mov r0, r1
+; V4T-NEXT:    movle r0, r1
+; V4T-NEXT:    ldr r1, .LCPI5_0
+; V4T-NEXT:    cmp r0, r1
+; V4T-NEXT:    movge r0, r1
 ; V4T-NEXT:    bx lr
 ; V4T-NEXT:    .p2align 2
 ; V4T-NEXT:  @ %bb.1:
@@ -219,11 +218,10 @@
 ; V4T-NEXT:    mov r1, #1065353216
 ; V4T-NEXT:    cmn r0, #8388608
 ; V4T-NEXT:    orr r1, r1, #-1073741824
-; V4T-NEXT:    movgt r1, r0
-; V4T-NEXT:    ldr r0, .LCPI6_0
-; V4T-NEXT:    cmp r1, r0
-; V4T-NEXT:    movge r1, r0
-; V4T-NEXT:    mov r0, r1
+; V4T-NEXT:    movle r0, r1
+; V4T-NEXT:    ldr r1, .LCPI6_0
+; V4T-NEXT:    cmp r0, r1
+; V4T-NEXT:    movge r0, r1
 ; V4T-NEXT:    bx lr
 ; V4T-NEXT:    .p2align 2
 ; V4T-NEXT:  @ %bb.1:
@@ -249,11 +247,10 @@
 ; V4T-NEXT:    mov r1, #1065353216
 ; V4T-NEXT:    cmn r0, #8388608
 ; V4T-NEXT:    orr r1, r1, #-1073741824
-; V4T-NEXT:    movgt r1, r0
-; V4T-NEXT:    ldr r0, .LCPI7_0
-; V4T-NEXT:    cmp r1, r0
-; V4T-NEXT:    movge r1, r0
-; V4T-NEXT:    mov r0, r1
+; V4T-NEXT:    movle r0, r1
+; V4T-NEXT:    ldr r1, .LCPI7_0
+; V4T-NEXT:    cmp r0, r1
+; V4T-NEXT:    movge r0, r1
 ; V4T-NEXT:    bx lr
 ; V4T-NEXT:    .p2align 2
 ; V4T-NEXT:  @ %bb.1:
@@ -284,11 +281,10 @@
 ; V4T-NEXT:    mov r1, #1065353216
 ; V4T-NEXT:    cmn r0, #8388608
 ; V4T-NEXT:    orr r1, r1, #-1073741824
-; V4T-NEXT:    movgt r1, r0
-; V4T-NEXT:    ldr r0, .LCPI8_0
-; V4T-NEXT:    cmp r1, r0
-; V4T-NEXT:    movge r1, r0
-; V4T-NEXT:    mov r0, r1
+; V4T-NEXT:    movle r0, r1
+; V4T-NEXT:    ldr r1, .LCPI8_0
+; V4T-NEXT:    cmp r0, r1
+; V4T-NEXT:    movge r0, r1
 ; V4T-NEXT:    bx lr
 ; V4T-NEXT:    .p2align 2
 ; V4T-NEXT:  @ %bb.1:
@@ -435,11 +431,10 @@
 ; V4T:       @ %bb.0: @ %entry
 ; V4T-NEXT:    ldr r1, .LCPI12_0
 ; V4T-NEXT:    cmp r0, r1
-; V4T-NEXT:    movgt r1, r0
-; V4T-NEXT:    ldr r0, .LCPI12_1
-; V4T-NEXT:    cmp r1, r0
-; V4T-NEXT:    movge r1, r0
-; V4T-NEXT:    mov r0, r1
+; V4T-NEXT:    movle r0, r1
+; V4T-NEXT:    ldr r1, .LCPI12_1
+; V4T-NEXT:    cmp r0, r1
+; V4T-NEXT:    movge r0, r1
 ; V4T-NEXT:    bx lr
 ; V4T-NEXT:    .p2align 2
 ; V4T-NEXT:  @ %bb.1:
@@ -453,12 +448,11 @@
 ; V6T2-NEXT:    movw r1, #47768
 ; V6T2-NEXT:    movt r1, #65244
 ; V6T2-NEXT:    cmp r0, r1
-; V6T2-NEXT:    movgt r1, r0
-; V6T2-NEXT:    movw r0, #65535
-; V6T2-NEXT:    movt r0, #127
-; V6T2-NEXT:    cmp r1, r0
-; V6T2-NEXT:    movge r1, r0
-; V6T2-NEXT:    mov r0, r1
+; V6T2-NEXT:    movle r0, r1
+; V6T2-NEXT:    movw r1, #65535
+; V6T2-NEXT:    movt r1, #127
+; V6T2-NEXT:    cmp r0, r1
+; V6T2-NEXT:    movge r0, r1
 ; V6T2-NEXT:    bx lr
 entry:
   %0 = icmp sgt i32 %x, -19088744
diff --git a/llvm/test/CodeGen/ARM/usat.ll b/llvm/test/CodeGen/ARM/usat.ll
index 9bb4104..84de3c9 100644
--- a/llvm/test/CodeGen/ARM/usat.ll
+++ b/llvm/test/CodeGen/ARM/usat.ll
@@ -52,42 +52,39 @@
 define i16 @unsigned_sat_base_16bit(i16 %x) #0 {
 ; V4T-LABEL: unsigned_sat_base_16bit:
 ; V4T:       @ %bb.0: @ %entry
+; V4T-NEXT:    mov r2, #255
 ; V4T-NEXT:    lsl r1, r0, #16
-; V4T-NEXT:    asr r2, r1, #16
-; V4T-NEXT:    mov r1, #255
-; V4T-NEXT:    orr r1, r1, #1792
-; V4T-NEXT:    cmp r2, r1
-; V4T-NEXT:    movlt r1, r0
-; V4T-NEXT:    lsl r0, r1, #16
-; V4T-NEXT:    asr r0, r0, #16
-; V4T-NEXT:    cmp r0, #0
-; V4T-NEXT:    movle r1, #0
-; V4T-NEXT:    mov r0, r1
+; V4T-NEXT:    orr r2, r2, #1792
+; V4T-NEXT:    asr r1, r1, #16
+; V4T-NEXT:    cmp r1, r2
+; V4T-NEXT:    movge r0, r2
+; V4T-NEXT:    lsl r1, r0, #16
+; V4T-NEXT:    asr r1, r1, #16
+; V4T-NEXT:    cmp r1, #0
+; V4T-NEXT:    movle r0, #0
 ; V4T-NEXT:    bx lr
 ;
 ; V6-LABEL: unsigned_sat_base_16bit:
 ; V6:       @ %bb.0: @ %entry
-; V6-NEXT:    mov r1, #255
-; V6-NEXT:    sxth r2, r0
-; V6-NEXT:    orr r1, r1, #1792
-; V6-NEXT:    cmp r2, r1
-; V6-NEXT:    movlt r1, r0
-; V6-NEXT:    sxth r0, r1
-; V6-NEXT:    cmp r0, #0
-; V6-NEXT:    movle r1, #0
-; V6-NEXT:    mov r0, r1
+; V6-NEXT:    mov r2, #255
+; V6-NEXT:    sxth r1, r0
+; V6-NEXT:    orr r2, r2, #1792
+; V6-NEXT:    cmp r1, r2
+; V6-NEXT:    movge r0, r2
+; V6-NEXT:    sxth r1, r0
+; V6-NEXT:    cmp r1, #0
+; V6-NEXT:    movle r0, #0
 ; V6-NEXT:    bx lr
 ;
 ; V6T2-LABEL: unsigned_sat_base_16bit:
 ; V6T2:       @ %bb.0: @ %entry
-; V6T2-NEXT:    sxth r2, r0
-; V6T2-NEXT:    movw r1, #2047
-; V6T2-NEXT:    cmp r2, r1
-; V6T2-NEXT:    movlt r1, r0
-; V6T2-NEXT:    sxth r0, r1
-; V6T2-NEXT:    cmp r0, #0
-; V6T2-NEXT:    movle r1, #0
-; V6T2-NEXT:    mov r0, r1
+; V6T2-NEXT:    sxth r1, r0
+; V6T2-NEXT:    movw r2, #2047
+; V6T2-NEXT:    cmp r1, r2
+; V6T2-NEXT:    movge r0, r2
+; V6T2-NEXT:    sxth r1, r0
+; V6T2-NEXT:    cmp r1, #0
+; V6T2-NEXT:    movle r0, #0
 ; V6T2-NEXT:    bx lr
 entry:
   %0 = icmp slt i16 %x, 2047
diff --git a/llvm/test/CodeGen/SystemZ/int-div-01.ll b/llvm/test/CodeGen/SystemZ/int-div-01.ll
index f67d6dc..04bff41 100644
--- a/llvm/test/CodeGen/SystemZ/int-div-01.ll
+++ b/llvm/test/CodeGen/SystemZ/int-div-01.ll
@@ -51,7 +51,8 @@
 ; CHECK-NOT: {{%r[234]}}
 ; CHECK: dsgfr %r2, %r4
 ; CHECK-NOT: dsgfr
-; CHECK: or %r2, %r3
+; CHECK: or %r3, %r2
+; CHECK: lr %r2, %r3
 ; CHECK: br %r14
   %div = sdiv i32 %a, %b
   %rem = srem i32 %a, %b
diff --git a/llvm/test/CodeGen/SystemZ/int-div-03.ll b/llvm/test/CodeGen/SystemZ/int-div-03.ll
index 48f99c2..c4a00ca 100644
--- a/llvm/test/CodeGen/SystemZ/int-div-03.ll
+++ b/llvm/test/CodeGen/SystemZ/int-div-03.ll
@@ -36,7 +36,8 @@
 ; CHECK-LABEL: f3:
 ; CHECK-NOT: {{%r[234]}}
 ; CHECK: dsgfr %r2, %r4
-; CHECK: ogr %r2, %r3
+; CHECK: ogr %r3, %r2
+; CHECK: lgr %r2, %r3
 ; CHECK: br %r14
   %bext = sext i32 %b to i64
   %div = sdiv i64 %a, %bext
@@ -102,7 +103,8 @@
 ; CHECK-NOT: {{%r[234]}}
 ; CHECK: dsgf %r2, 0(%r4)
 ; CHECK-NOT: {{dsgf|dsgfr}}
-; CHECK: ogr %r2, %r3
+; CHECK: ogr %r3, %r2
+; CHECK: lgr %r2, %r3
 ; CHECK: br %r14
   %b = load i32, i32 *%src
   %bext = sext i32 %b to i64
diff --git a/llvm/test/CodeGen/SystemZ/int-div-04.ll b/llvm/test/CodeGen/SystemZ/int-div-04.ll
index c06d7bd..0448eed 100644
--- a/llvm/test/CodeGen/SystemZ/int-div-04.ll
+++ b/llvm/test/CodeGen/SystemZ/int-div-04.ll
@@ -34,7 +34,8 @@
 ; CHECK-NOT: {{%r[234]}}
 ; CHECK: dsgr %r2, %r4
 ; CHECK-NOT: dsgr
-; CHECK: ogr %r2, %r3
+; CHECK: ogr %r3, %r2
+; CHECK: lgr %r2, %r3
 ; CHECK: br %r14
   %div = sdiv i64 %a, %b
   %rem = srem i64 %a, %b
@@ -74,7 +75,8 @@
 ; CHECK-NOT: {{%r[234]}}
 ; CHECK: dsg %r2, 0(%r4)
 ; CHECK-NOT: {{dsg|dsgr}}
-; CHECK: ogr %r2, %r3
+; CHECK: ogr %r3, %r2
+; CHECK: lgr %r2, %r3
 ; CHECK: br %r14
   %b = load i64, i64 *%src
   %div = sdiv i64 %a, %b
diff --git a/llvm/test/CodeGen/SystemZ/int-mul-08.ll b/llvm/test/CodeGen/SystemZ/int-mul-08.ll
index 75aba3c..cb61a0c 100644
--- a/llvm/test/CodeGen/SystemZ/int-mul-08.ll
+++ b/llvm/test/CodeGen/SystemZ/int-mul-08.ll
@@ -60,7 +60,8 @@
 ; CHECK-LABEL: f4:
 ; CHECK-NOT: {{%r[234]}}
 ; CHECK: mlgr %r2, %r4
-; CHECK: ogr %r2, %r3
+; CHECK: ogr %r3, %r2
+; CHECK: lgr %r2, %r3
 ; CHECK: br %r14
   %ax = zext i64 %a to i128
   %bx = zext i64 %b to i128
diff --git a/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
index 2eac1f4..a704d7b5 100644
--- a/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
@@ -117,10 +117,10 @@
 ; CHECK-NEXT:    mvns r3, r7
 ; CHECK-NEXT:    ldr r0, [sp, #32]
 ; CHECK-NEXT:    bl __aeabi_ldivmod
-; CHECK-NEXT:    ands r3, r5
+; CHECK-NEXT:    ands r5, r3
 ; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    eors r2, r0
-; CHECK-NEXT:    orrs r2, r3
+; CHECK-NEXT:    orrs r2, r5
 ; CHECK-NEXT:    subs r0, r2, #1
 ; CHECK-NEXT:    sbcs r2, r0
 ; CHECK-NEXT:    movs r0, r4
diff --git a/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll b/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
index 6f8661f..6e90381 100644
--- a/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
+++ b/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
@@ -284,9 +284,8 @@
 define i64 @t5_cse(i64 %val, i64 %shamt, i64*%dst) nounwind {
 ; X64-NOBMI2-LABEL: t5_cse:
 ; X64-NOBMI2:       # %bb.0:
-; X64-NOBMI2-NEXT:    movq %rsi, %rcx
 ; X64-NOBMI2-NEXT:    movq %rdi, %rax
-; X64-NOBMI2-NEXT:    addq $32, %rcx
+; X64-NOBMI2-NEXT:    leaq 32(%rsi), %rcx
 ; X64-NOBMI2-NEXT:    movq %rcx, (%rdx)
 ; X64-NOBMI2-NEXT:    negq %rcx
 ; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
diff --git a/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll b/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
index 2485599f..b8b3440 100644
--- a/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
+++ b/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
@@ -61,11 +61,13 @@
 ;
 ; X86_64-LABEL: t0:
 ; X86_64:       # %bb.0:
-; X86_64-NEXT:    movl %esi, %eax
-; X86_64-NEXT:    addl %edx, %edi
-; X86_64-NEXT:    addl %edx, %eax
-; X86_64-NEXT:    cmpb %al, %dil
-; X86_64-NEXT:    cmovgl %edi, %eax
+; X86_64-NEXT:    # kill: def $edx killed $edx def $rdx
+; X86_64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X86_64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X86_64-NEXT:    leal (%rdi,%rdx), %ecx
+; X86_64-NEXT:    leal (%rsi,%rdx), %eax
+; X86_64-NEXT:    cmpb %al, %cl
+; X86_64-NEXT:    cmovgl %ecx, %eax
 ; X86_64-NEXT:    # kill: def $al killed $al killed $eax
 ; X86_64-NEXT:    retq
   %a1_wide = add i32 %a1_wide_orig, %inc
@@ -134,11 +136,13 @@
 ;
 ; X86_64-LABEL: neg_only_one_truncation:
 ; X86_64:       # %bb.0:
-; X86_64-NEXT:    addl %edx, %edi
+; X86_64-NEXT:    # kill: def $edx killed $edx def $rdx
+; X86_64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X86_64-NEXT:    leal (%rdi,%rdx), %ecx
 ; X86_64-NEXT:    addb %sil, %dl
-; X86_64-NEXT:    cmpb %dl, %dil
+; X86_64-NEXT:    cmpb %dl, %cl
 ; X86_64-NEXT:    movzbl %dl, %eax
-; X86_64-NEXT:    cmovgl %edi, %eax
+; X86_64-NEXT:    cmovgl %ecx, %eax
 ; X86_64-NEXT:    # kill: def $al killed $al killed $eax
 ; X86_64-NEXT:    retq
   %a1_wide = add i32 %a1_wide_orig, %inc
@@ -205,11 +209,13 @@
 ;
 ; X86_64-LABEL: neg_type_mismatch:
 ; X86_64:       # %bb.0:
-; X86_64-NEXT:    movl %esi, %eax
-; X86_64-NEXT:    addl %edx, %edi
-; X86_64-NEXT:    addl %edx, %eax
-; X86_64-NEXT:    cmpb %al, %dil
-; X86_64-NEXT:    cmovgl %edi, %eax
+; X86_64-NEXT:    # kill: def $edx killed $edx def $rdx
+; X86_64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X86_64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X86_64-NEXT:    leal (%rdi,%rdx), %ecx
+; X86_64-NEXT:    leal (%rsi,%rdx), %eax
+; X86_64-NEXT:    cmpb %al, %cl
+; X86_64-NEXT:    cmovgl %ecx, %eax
 ; X86_64-NEXT:    # kill: def $al killed $al killed $eax
 ; X86_64-NEXT:    retq
   %a1_wide = add i32 %a1_wide_orig, %inc
@@ -271,8 +277,9 @@
 ;
 ; X86_64-LABEL: negative_CopyFromReg:
 ; X86_64:       # %bb.0:
-; X86_64-NEXT:    movl %esi, %eax
-; X86_64-NEXT:    addl %edx, %eax
+; X86_64-NEXT:    # kill: def $edx killed $edx def $rdx
+; X86_64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X86_64-NEXT:    leal (%rsi,%rdx), %eax
 ; X86_64-NEXT:    cmpb %al, %dil
 ; X86_64-NEXT:    cmovgl %edi, %eax
 ; X86_64-NEXT:    # kill: def $al killed $al killed $eax
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index 5cfb7a3..0162a0e 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -734,11 +734,12 @@
 ; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
 ; CHECK-O3-NEXT:    movq %rcx, %rax
 ; CHECK-O3-NEXT:    imulq %rdx
-; CHECK-O3-NEXT:    addq %rcx, %rdx
-; CHECK-O3-NEXT:    movq %rdx, %rax
+; CHECK-O3-NEXT:    addq %rdx, %rcx
+; CHECK-O3-NEXT:    movq %rcx, %rax
 ; CHECK-O3-NEXT:    shrq $63, %rax
-; CHECK-O3-NEXT:    sarq $3, %rdx
-; CHECK-O3-NEXT:    addq %rdx, %rax
+; CHECK-O3-NEXT:    sarq $3, %rcx
+; CHECK-O3-NEXT:    addq %rax, %rcx
+; CHECK-O3-NEXT:    movq %rcx, %rax
 ; CHECK-O3-NEXT:    retq
   %v = load atomic i64, i64* %p unordered, align 8
   %ret = sdiv i64 %v, 15
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 7168209..9d71fe7 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -393,10 +393,10 @@
 define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm0, %ymm1
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
+; CHECK-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
   ret <16 x i16> %res
@@ -419,11 +419,11 @@
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vpermi2w %ymm3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
+; CHECK-NEXT:    vpermt2w %ymm2, %ymm3, %ymm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -2248,11 +2248,11 @@
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,0,3,4]
+; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,0,3,4]
 ; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
+; CHECK-NEXT:    vpermt2q %ymm2, %ymm3, %ymm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -4029,12 +4029,12 @@
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
-; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm2 = [1,1,5,5]
+; CHECK-FAST-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
+; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [1,1,5,5]
 ; CHECK-FAST-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-FAST-NEXT:    vcmpeqpd %ymm4, %ymm1, %k1
-; CHECK-FAST-NEXT:    vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-FAST-NEXT:    vmovapd %ymm2, %ymm0
+; CHECK-FAST-NEXT:    vpermt2pd %ymm2, %ymm3, %ymm0 {%k1} {z}
+; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index 8433c75..7ac0301 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -372,13 +372,13 @@
 ; X64-NEXT:    shlb $2, %al
 ; X64-NEXT:    shrb $2, %dil
 ; X64-NEXT:    andb $51, %dil
-; X64-NEXT:    orb %al, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andb $85, %al
-; X64-NEXT:    addb %al, %al
-; X64-NEXT:    shrb %dil
-; X64-NEXT:    andb $85, %dil
 ; X64-NEXT:    orb %dil, %al
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    andb $85, %cl
+; X64-NEXT:    addb %cl, %cl
+; X64-NEXT:    shrb %al
+; X64-NEXT:    andb $85, %al
+; X64-NEXT:    orb %cl, %al
 ; X64-NEXT:    retq
 ;
 ; X86XOP-LABEL: test_bitreverse_i8:
@@ -422,13 +422,13 @@
 ; X64-NEXT:    shlb $2, %al
 ; X64-NEXT:    shrb $2, %dil
 ; X64-NEXT:    andb $51, %dil
-; X64-NEXT:    orb %al, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andb $80, %al
-; X64-NEXT:    addb %al, %al
-; X64-NEXT:    shrb %dil
-; X64-NEXT:    andb $80, %dil
 ; X64-NEXT:    orb %dil, %al
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    andb $80, %cl
+; X64-NEXT:    addb %cl, %cl
+; X64-NEXT:    shrb %al
+; X64-NEXT:    andb $80, %al
+; X64-NEXT:    orb %cl, %al
 ; X64-NEXT:    shrb $4, %al
 ; X64-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/bmi2.ll b/llvm/test/CodeGen/X86/bmi2.ll
index 8c86c5e..a32c309 100644
--- a/llvm/test/CodeGen/X86/bmi2.ll
+++ b/llvm/test/CodeGen/X86/bmi2.ll
@@ -305,10 +305,10 @@
 ;
 ; X64-LABEL: mulx32:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    addl %edi, %edi
-; X64-NEXT:    addl %eax, %eax
+; X64-NEXT:    leal (%rsi,%rsi), %eax
 ; X64-NEXT:    imulq %rdi, %rax
 ; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    shrq $32, %rcx
@@ -340,8 +340,8 @@
 ;
 ; X64-LABEL: mulx32_load:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    addl %eax, %eax
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi), %eax
 ; X64-NEXT:    movl (%rsi), %ecx
 ; X64-NEXT:    imulq %rcx, %rax
 ; X64-NEXT:    movq %rax, %rcx
diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-32.ll b/llvm/test/CodeGen/X86/bypass-slow-division-32.ll
index 988e4f9..fe4201d 100644
--- a/llvm/test/CodeGen/X86/bypass-slow-division-32.ll
+++ b/llvm/test/CodeGen/X86/bypass-slow-division-32.ll
@@ -175,12 +175,12 @@
 ; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    shrl $31, %eax
 ; CHECK-NEXT:    sarl $3, %edx
-; CHECK-NEXT:    addl %eax, %edx
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    shll $5, %eax
 ; CHECK-NEXT:    addl %edx, %eax
-; CHECK-NEXT:    subl %eax, %ecx
-; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    movl %eax, %edx
+; CHECK-NEXT:    shll $5, %edx
+; CHECK-NEXT:    addl %eax, %edx
+; CHECK-NEXT:    subl %edx, %ecx
+; CHECK-NEXT:    addl %eax, %ecx
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    retl
   %resultdiv = sdiv i32 %a, 33
diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll
index 207bfee..d25baf1 100644
--- a/llvm/test/CodeGen/X86/combine-bitselect.ll
+++ b/llvm/test/CodeGen/X86/combine-bitselect.ll
@@ -991,14 +991,13 @@
 ; SSE-LABEL: bitselect_v4i1_loop:
 ; SSE:       # %bb.0: # %bb
 ; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [12,12,12,12]
-; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [12,12,12,12]
+; SSE-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    pandn %xmm0, %xmm2
-; SSE-NEXT:    por %xmm1, %xmm2
-; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pandn %xmm2, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; XOP-LABEL: bitselect_v4i1_loop:
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 4ac4056..b60c1d3 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -1613,15 +1613,14 @@
 ; SSE2-NEXT:    psrlq $61, %xmm3
 ; SSE2-NEXT:    psrlq $60, %xmm2
 ; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; SSE2-NEXT:    paddq %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    psrlq $3, %xmm1
-; SSE2-NEXT:    psrlq $4, %xmm2
-; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; SSE2-NEXT:    movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
-; SSE2-NEXT:    xorpd %xmm1, %xmm2
-; SSE2-NEXT:    psubq %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    paddq %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrlq $3, %xmm2
+; SSE2-NEXT:    psrlq $4, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [1152921504606846976,576460752303423488]
+; SSE2-NEXT:    xorpd %xmm2, %xmm1
+; SSE2-NEXT:    psubq %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
@@ -1642,15 +1641,14 @@
 ; SSE41-NEXT:    psrlq $60, %xmm3
 ; SSE41-NEXT:    psrlq $61, %xmm2
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT:    paddq %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm1
-; SSE41-NEXT:    psrlq $4, %xmm1
-; SSE41-NEXT:    psrlq $3, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
-; SSE41-NEXT:    pxor %xmm1, %xmm2
-; SSE41-NEXT:    psubq %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    paddq %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlq $4, %xmm2
+; SSE41-NEXT:    psrlq $3, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1152921504606846976,576460752303423488]
+; SSE41-NEXT:    pxor %xmm2, %xmm1
+; SSE41-NEXT:    psubq %xmm2, %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
@@ -1762,14 +1760,14 @@
 ; SSE2-NEXT:    psrlq $61, %xmm5
 ; SSE2-NEXT:    psrlq $60, %xmm4
 ; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
-; SSE2-NEXT:    paddq %xmm1, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
-; SSE2-NEXT:    psrlq $3, %xmm1
-; SSE2-NEXT:    psrlq $4, %xmm4
-; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
-; SSE2-NEXT:    movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
-; SSE2-NEXT:    xorpd %xmm1, %xmm4
-; SSE2-NEXT:    psubq %xmm1, %xmm4
+; SSE2-NEXT:    paddq %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    psrlq $3, %xmm4
+; SSE2-NEXT:    psrlq $4, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
+; SSE2-NEXT:    movapd {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
+; SSE2-NEXT:    xorpd %xmm4, %xmm1
+; SSE2-NEXT:    psubq %xmm4, %xmm1
 ; SSE2-NEXT:    movdqa %xmm3, %xmm5
 ; SSE2-NEXT:    psrad $31, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
@@ -1777,68 +1775,64 @@
 ; SSE2-NEXT:    psrlq $61, %xmm6
 ; SSE2-NEXT:    psrlq $60, %xmm5
 ; SSE2-NEXT:    movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1]
-; SSE2-NEXT:    paddq %xmm3, %xmm5
-; SSE2-NEXT:    movdqa %xmm5, %xmm3
-; SSE2-NEXT:    psrlq $3, %xmm3
-; SSE2-NEXT:    psrlq $4, %xmm5
-; SSE2-NEXT:    movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
-; SSE2-NEXT:    xorpd %xmm1, %xmm5
-; SSE2-NEXT:    psubq %xmm1, %xmm5
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm5, %xmm3
+; SSE2-NEXT:    paddq %xmm5, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NEXT:    psrlq $3, %xmm5
+; SSE2-NEXT:    psrlq $4, %xmm3
+; SSE2-NEXT:    movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1]
+; SSE2-NEXT:    xorpd %xmm4, %xmm3
+; SSE2-NEXT:    psubq %xmm4, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
 ; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    psrad $31, %xmm4
+; SSE41-NEXT:    psrlq $62, %xmm4
+; SSE41-NEXT:    paddq %xmm0, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm5
+; SSE41-NEXT:    psrad $2, %xmm5
+; SSE41-NEXT:    psrlq $2, %xmm4
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm2, %xmm4
+; SSE41-NEXT:    psrad $31, %xmm4
+; SSE41-NEXT:    psrlq $62, %xmm4
+; SSE41-NEXT:    paddq %xmm2, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm5
+; SSE41-NEXT:    psrad $2, %xmm5
+; SSE41-NEXT:    psrlq $2, %xmm4
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
 ; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psrad $31, %xmm1
-; SSE41-NEXT:    psrlq $62, %xmm1
-; SSE41-NEXT:    paddq %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm5
-; SSE41-NEXT:    psrad $2, %xmm5
-; SSE41-NEXT:    psrlq $2, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT:    movdqa %xmm2, %xmm1
-; SSE41-NEXT:    psrad $31, %xmm1
-; SSE41-NEXT:    psrlq $62, %xmm1
-; SSE41-NEXT:    paddq %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm5
-; SSE41-NEXT:    psrad $2, %xmm5
-; SSE41-NEXT:    psrlq $2, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT:    movdqa %xmm4, %xmm1
-; SSE41-NEXT:    psrad $31, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    movdqa %xmm1, %xmm5
+; SSE41-NEXT:    psrad $31, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE41-NEXT:    movdqa %xmm4, %xmm5
 ; SSE41-NEXT:    psrlq $60, %xmm5
-; SSE41-NEXT:    psrlq $61, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    psrlq $61, %xmm4
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7]
 ; SSE41-NEXT:    paddq %xmm4, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm4
 ; SSE41-NEXT:    psrlq $4, %xmm4
 ; SSE41-NEXT:    psrlq $3, %xmm1
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [1152921504606846976,576460752303423488]
-; SSE41-NEXT:    pxor %xmm5, %xmm1
-; SSE41-NEXT:    psubq %xmm5, %xmm1
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    psrad $31, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT:    movdqa %xmm4, %xmm6
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
+; SSE41-NEXT:    pxor %xmm4, %xmm1
+; SSE41-NEXT:    psubq %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm3, %xmm5
+; SSE41-NEXT:    psrad $31, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE41-NEXT:    movdqa %xmm5, %xmm6
 ; SSE41-NEXT:    psrlq $60, %xmm6
-; SSE41-NEXT:    psrlq $61, %xmm4
-; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT:    paddq %xmm3, %xmm4
-; SSE41-NEXT:    movdqa %xmm4, %xmm3
-; SSE41-NEXT:    psrlq $4, %xmm3
-; SSE41-NEXT:    psrlq $3, %xmm4
-; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT:    pxor %xmm5, %xmm4
-; SSE41-NEXT:    psubq %xmm5, %xmm4
-; SSE41-NEXT:    movdqa %xmm4, %xmm3
+; SSE41-NEXT:    psrlq $61, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT:    paddq %xmm5, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm5
+; SSE41-NEXT:    psrlq $4, %xmm5
+; SSE41-NEXT:    psrlq $3, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    pxor %xmm4, %xmm3
+; SSE41-NEXT:    psubq %xmm4, %xmm3
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
@@ -1968,29 +1962,28 @@
 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    psrld $28, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
 ; SSE2-NEXT:    psrld $29, %xmm3
 ; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE2-NEXT:    psrld $30, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
-; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrld $30, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    psrad $4, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
 ; SSE2-NEXT:    psrad $3, %xmm3
 ; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE2-NEXT:    psrad $2, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
-; SSE2-NEXT:    movaps %xmm0, %xmm2
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm0[2,3]
-; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    psubd %xmm2, %xmm3
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    psrad $2, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    psubd %xmm1, %xmm2
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
@@ -2340,10 +2333,10 @@
 ; SSE41-NEXT:    psrld $31, %xmm1
 ; SSE41-NEXT:    paddd %xmm0, %xmm1
 ; SSE41-NEXT:    psrad $1, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psubd %xmm1, %xmm0
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    psubd %xmm0, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: non_splat_minus_one_divisor_2:
@@ -2480,30 +2473,28 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
 ; SSE2-NEXT:    pmulhw %xmm0, %xmm1
-; SSE2-NEXT:    paddw %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psraw $4, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    paddw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psraw $4, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    psraw $8, %xmm2
-; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE2-NEXT:    psrlw $15, %xmm1
-; SSE2-NEXT:    paddw %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; SSE2-NEXT:    psrlw $15, %xmm0
+; SSE2-NEXT:    paddw %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_vec_sdiv_nonuniform3:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
 ; SSE41-NEXT:    pmulhw %xmm0, %xmm1
-; SSE41-NEXT:    paddw %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    psraw $8, %xmm0
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    paddw %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psraw $8, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    psraw $4, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT:    psrlw $15, %xmm1
-; SSE41-NEXT:    paddw %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    psrlw $15, %xmm0
+; SSE41-NEXT:    paddw %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: combine_vec_sdiv_nonuniform3:
@@ -2918,26 +2909,25 @@
 define <16 x i8> @pr38658(<16 x i8> %x) {
 ; SSE2-LABEL: pr38658:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE2-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT:    psrlw $8, %xmm3
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    packuswb %xmm3, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT:    psraw $8, %xmm2
-; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE2-NEXT:    psrlw $8, %xmm2
-; SSE2-NEXT:    packuswb %xmm2, %xmm0
-; SSE2-NEXT:    psrlw $7, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    packuswb %xmm2, %xmm3
+; SSE2-NEXT:    paddb %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    packuswb %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $7, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: pr38658:
@@ -2948,21 +2938,20 @@
 ; SSE41-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE41-NEXT:    psrlw $8, %xmm2
 ; SSE41-NEXT:    packuswb %xmm2, %xmm1
-; SSE41-NEXT:    paddb %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; SSE41-NEXT:    psraw $8, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    paddb %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE41-NEXT:    psraw $8, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
 ; SSE41-NEXT:    psllw $6, %xmm2
-; SSE41-NEXT:    psllw $8, %xmm0
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; SSE41-NEXT:    psrlw $8, %xmm0
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT:    packuswb %xmm0, %xmm2
-; SSE41-NEXT:    psrlw $7, %xmm1
-; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT:    paddb %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psllw $8, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
+; SSE41-NEXT:    psrlw $8, %xmm1
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    packuswb %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $7, %xmm0
+; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    paddb %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: pr38658:
diff --git a/llvm/test/CodeGen/X86/ctpop-combine.ll b/llvm/test/CodeGen/X86/ctpop-combine.ll
index 0040ee2..0070b1a 100644
--- a/llvm/test/CodeGen/X86/ctpop-combine.ll
+++ b/llvm/test/CodeGen/X86/ctpop-combine.ll
@@ -93,14 +93,14 @@
 ; NO-POPCOUNT-NEXT:    shrb %al
 ; NO-POPCOUNT-NEXT:    andb $21, %al
 ; NO-POPCOUNT-NEXT:    subb %al, %dil
-; NO-POPCOUNT-NEXT:    movl %edi, %eax
-; NO-POPCOUNT-NEXT:    andb $51, %al
+; NO-POPCOUNT-NEXT:    movl %edi, %ecx
+; NO-POPCOUNT-NEXT:    andb $51, %cl
 ; NO-POPCOUNT-NEXT:    shrb $2, %dil
 ; NO-POPCOUNT-NEXT:    andb $51, %dil
-; NO-POPCOUNT-NEXT:    addb %al, %dil
-; NO-POPCOUNT-NEXT:    movl %edi, %eax
+; NO-POPCOUNT-NEXT:    addb %dil, %cl
+; NO-POPCOUNT-NEXT:    movl %ecx, %eax
 ; NO-POPCOUNT-NEXT:    shrb $4, %al
-; NO-POPCOUNT-NEXT:    addb %dil, %al
+; NO-POPCOUNT-NEXT:    addb %cl, %al
 ; NO-POPCOUNT-NEXT:    andb $15, %al
 ; NO-POPCOUNT-NEXT:    retq
   %x2 = and i8 %x, 127
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index 23a3d1e..6cb36ea 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -442,10 +442,9 @@
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    imulq %rcx
 ; X64-NEXT:    movq %rdx, %rax
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    shrq $63, %rcx
-; X64-NEXT:    sarq $28, %rax
-; X64-NEXT:    addq %rcx, %rax
+; X64-NEXT:    shrq $63, %rax
+; X64-NEXT:    sarq $28, %rdx
+; X64-NEXT:    addq %rdx, %rax
 ; X64-NEXT:    imull $-294967296, %eax, %ecx # imm = 0xEE6B2800
 ; X64-NEXT:    subl %ecx, %edi
 ; X64-NEXT:    movl %edi, %edx
diff --git a/llvm/test/CodeGen/X86/fpclamptosat.ll b/llvm/test/CodeGen/X86/fpclamptosat.ll
index dd21c08..aba2740 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat.ll
@@ -430,19 +430,18 @@
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    callq __fixdfti@PLT
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT:    cmpq %rcx, %rax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT:    cmpq %rsi, %rax
 ; CHECK-NEXT:    movq %rdx, %rdi
 ; CHECK-NEXT:    sbbq $0, %rdi
-; CHECK-NEXT:    cmovlq %rdx, %rsi
-; CHECK-NEXT:    cmovlq %rax, %rcx
-; CHECK-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; CHECK-NEXT:    cmpq %rcx, %rax
-; CHECK-NEXT:    movq $-1, %rdx
-; CHECK-NEXT:    sbbq %rsi, %rdx
-; CHECK-NEXT:    cmovgeq %rax, %rcx
-; CHECK-NEXT:    movq %rcx, %rax
+; CHECK-NEXT:    cmovlq %rdx, %rcx
+; CHECK-NEXT:    cmovgeq %rsi, %rax
+; CHECK-NEXT:    movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000
+; CHECK-NEXT:    cmpq %rax, %rdx
+; CHECK-NEXT:    movq $-1, %rsi
+; CHECK-NEXT:    sbbq %rcx, %rsi
+; CHECK-NEXT:    cmovgeq %rdx, %rax
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -511,19 +510,18 @@
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    callq __fixsfti@PLT
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT:    cmpq %rcx, %rax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT:    cmpq %rsi, %rax
 ; CHECK-NEXT:    movq %rdx, %rdi
 ; CHECK-NEXT:    sbbq $0, %rdi
-; CHECK-NEXT:    cmovlq %rdx, %rsi
-; CHECK-NEXT:    cmovlq %rax, %rcx
-; CHECK-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; CHECK-NEXT:    cmpq %rcx, %rax
-; CHECK-NEXT:    movq $-1, %rdx
-; CHECK-NEXT:    sbbq %rsi, %rdx
-; CHECK-NEXT:    cmovgeq %rax, %rcx
-; CHECK-NEXT:    movq %rcx, %rax
+; CHECK-NEXT:    cmovlq %rdx, %rcx
+; CHECK-NEXT:    cmovgeq %rsi, %rax
+; CHECK-NEXT:    movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000
+; CHECK-NEXT:    cmpq %rax, %rdx
+; CHECK-NEXT:    movq $-1, %rsi
+; CHECK-NEXT:    sbbq %rcx, %rsi
+; CHECK-NEXT:    cmovgeq %rdx, %rax
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -594,19 +592,18 @@
 ; CHECK-NEXT:    movzwl %di, %edi
 ; CHECK-NEXT:    callq __gnu_h2f_ieee@PLT
 ; CHECK-NEXT:    callq __fixsfti@PLT
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT:    cmpq %rcx, %rax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT:    cmpq %rsi, %rax
 ; CHECK-NEXT:    movq %rdx, %rdi
 ; CHECK-NEXT:    sbbq $0, %rdi
-; CHECK-NEXT:    cmovlq %rdx, %rsi
-; CHECK-NEXT:    cmovlq %rax, %rcx
-; CHECK-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; CHECK-NEXT:    cmpq %rcx, %rax
-; CHECK-NEXT:    movq $-1, %rdx
-; CHECK-NEXT:    sbbq %rsi, %rdx
-; CHECK-NEXT:    cmovgeq %rax, %rcx
-; CHECK-NEXT:    movq %rcx, %rax
+; CHECK-NEXT:    cmovlq %rdx, %rcx
+; CHECK-NEXT:    cmovgeq %rsi, %rax
+; CHECK-NEXT:    movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000
+; CHECK-NEXT:    cmpq %rax, %rdx
+; CHECK-NEXT:    movq $-1, %rsi
+; CHECK-NEXT:    sbbq %rcx, %rsi
+; CHECK-NEXT:    cmovgeq %rdx, %rax
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -1084,18 +1081,17 @@
 ; CHECK-NEXT:    cmovbq %rax, %rsi
 ; CHECK-NEXT:    xorl %edi, %edi
 ; CHECK-NEXT:    testq %rdx, %rdx
-; CHECK-NEXT:    cmovsq %rax, %rcx
-; CHECK-NEXT:    cmoveq %rsi, %rcx
+; CHECK-NEXT:    cmovnsq %rcx, %rax
+; CHECK-NEXT:    cmoveq %rsi, %rax
 ; CHECK-NEXT:    cmovsq %rdx, %rdi
 ; CHECK-NEXT:    testq %rdi, %rdi
-; CHECK-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; CHECK-NEXT:    movq %rax, %rdx
-; CHECK-NEXT:    cmovnsq %rcx, %rdx
-; CHECK-NEXT:    cmpq %rax, %rcx
-; CHECK-NEXT:    cmovbeq %rax, %rcx
+; CHECK-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; CHECK-NEXT:    movq %rcx, %rdx
+; CHECK-NEXT:    cmovnsq %rax, %rdx
+; CHECK-NEXT:    cmpq %rcx, %rax
+; CHECK-NEXT:    cmovbeq %rcx, %rax
 ; CHECK-NEXT:    cmpq $-1, %rdi
-; CHECK-NEXT:    cmovneq %rdx, %rcx
-; CHECK-NEXT:    movq %rcx, %rax
+; CHECK-NEXT:    cmovneq %rdx, %rax
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -1166,18 +1162,17 @@
 ; CHECK-NEXT:    cmovbq %rax, %rsi
 ; CHECK-NEXT:    xorl %edi, %edi
 ; CHECK-NEXT:    testq %rdx, %rdx
-; CHECK-NEXT:    cmovsq %rax, %rcx
-; CHECK-NEXT:    cmoveq %rsi, %rcx
+; CHECK-NEXT:    cmovnsq %rcx, %rax
+; CHECK-NEXT:    cmoveq %rsi, %rax
 ; CHECK-NEXT:    cmovsq %rdx, %rdi
 ; CHECK-NEXT:    testq %rdi, %rdi
-; CHECK-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; CHECK-NEXT:    movq %rax, %rdx
-; CHECK-NEXT:    cmovnsq %rcx, %rdx
-; CHECK-NEXT:    cmpq %rax, %rcx
-; CHECK-NEXT:    cmovbeq %rax, %rcx
+; CHECK-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; CHECK-NEXT:    movq %rcx, %rdx
+; CHECK-NEXT:    cmovnsq %rax, %rdx
+; CHECK-NEXT:    cmpq %rcx, %rax
+; CHECK-NEXT:    cmovbeq %rcx, %rax
 ; CHECK-NEXT:    cmpq $-1, %rdi
-; CHECK-NEXT:    cmovneq %rdx, %rcx
-; CHECK-NEXT:    movq %rcx, %rax
+; CHECK-NEXT:    cmovneq %rdx, %rax
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -1250,18 +1245,17 @@
 ; CHECK-NEXT:    cmovbq %rax, %rsi
 ; CHECK-NEXT:    xorl %edi, %edi
 ; CHECK-NEXT:    testq %rdx, %rdx
-; CHECK-NEXT:    cmovsq %rax, %rcx
-; CHECK-NEXT:    cmoveq %rsi, %rcx
+; CHECK-NEXT:    cmovnsq %rcx, %rax
+; CHECK-NEXT:    cmoveq %rsi, %rax
 ; CHECK-NEXT:    cmovsq %rdx, %rdi
 ; CHECK-NEXT:    testq %rdi, %rdi
-; CHECK-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; CHECK-NEXT:    movq %rax, %rdx
-; CHECK-NEXT:    cmovnsq %rcx, %rdx
-; CHECK-NEXT:    cmpq %rax, %rcx
-; CHECK-NEXT:    cmovbeq %rax, %rcx
+; CHECK-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; CHECK-NEXT:    movq %rcx, %rdx
+; CHECK-NEXT:    cmovnsq %rax, %rdx
+; CHECK-NEXT:    cmpq %rcx, %rax
+; CHECK-NEXT:    cmovbeq %rcx, %rax
 ; CHECK-NEXT:    cmpq $-1, %rdi
-; CHECK-NEXT:    cmovneq %rdx, %rcx
-; CHECK-NEXT:    movq %rcx, %rax
+; CHECK-NEXT:    cmovneq %rdx, %rax
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/haddsub-3.ll b/llvm/test/CodeGen/X86/haddsub-3.ll
index baf7be7..4805945 100644
--- a/llvm/test/CodeGen/X86/haddsub-3.ll
+++ b/llvm/test/CodeGen/X86/haddsub-3.ll
@@ -11,18 +11,18 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps %xmm0, %xmm1
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; SSE2-NEXT:    addps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT:    addps %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE2-NEXT:    addss %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-SLOW-LABEL: pr26491:
 ; SSSE3-SLOW:       # %bb.0:
 ; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSSE3-SLOW-NEXT:    addps %xmm0, %xmm1
-; SSSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
-; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 ; SSSE3-SLOW-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll
index 74c10d3..06f3614 100644
--- a/llvm/test/CodeGen/X86/haddsub-shuf.ll
+++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll
@@ -479,8 +479,8 @@
 ; SSE3-LABEL: hadd_v8i32a:
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movaps %xmm0, %xmm2
-; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
+; SSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
 ; SSE3-NEXT:    paddd %xmm0, %xmm2
 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
 ; SSE3-NEXT:    movdqa %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll
index 8e7cadc..19f562f 100644
--- a/llvm/test/CodeGen/X86/haddsub.ll
+++ b/llvm/test/CodeGen/X86/haddsub.ll
@@ -1777,8 +1777,8 @@
 ; SSE3-SLOW:       # %bb.0:
 ; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT:    addps %xmm0, %xmm1
-; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 ; SSE3-SLOW-NEXT:    retq
 ;
@@ -1786,9 +1786,8 @@
 ; SSE3-FAST:       # %bb.0:
 ; SSE3-FAST-NEXT:    movaps %xmm0, %xmm1
 ; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-FAST-NEXT:    addps %xmm0, %xmm1
-; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
-; SSE3-FAST-NEXT:    movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT:    addps %xmm1, %xmm0
+; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
 ; SSE3-FAST-NEXT:    retq
 ;
 ; AVX-SLOW-LABEL: hadd32_4:
@@ -1818,8 +1817,8 @@
 ; SSE3-SLOW:       # %bb.0:
 ; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT:    addps %xmm0, %xmm1
-; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 ; SSE3-SLOW-NEXT:    retq
 ;
@@ -1827,9 +1826,8 @@
 ; SSE3-FAST:       # %bb.0:
 ; SSE3-FAST-NEXT:    movaps %xmm0, %xmm1
 ; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-FAST-NEXT:    addps %xmm0, %xmm1
-; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
-; SSE3-FAST-NEXT:    movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT:    addps %xmm1, %xmm0
+; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
 ; SSE3-FAST-NEXT:    retq
 ;
 ; AVX-SLOW-LABEL: hadd32_8:
@@ -1861,8 +1859,8 @@
 ; SSE3-SLOW:       # %bb.0:
 ; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT:    addps %xmm0, %xmm1
-; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 ; SSE3-SLOW-NEXT:    retq
 ;
@@ -1870,9 +1868,8 @@
 ; SSE3-FAST:       # %bb.0:
 ; SSE3-FAST-NEXT:    movaps %xmm0, %xmm1
 ; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-FAST-NEXT:    addps %xmm0, %xmm1
-; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
-; SSE3-FAST-NEXT:    movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT:    addps %xmm1, %xmm0
+; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
 ; SSE3-FAST-NEXT:    retq
 ;
 ; AVX-SLOW-LABEL: hadd32_16:
@@ -1904,9 +1901,8 @@
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movaps %xmm0, %xmm1
 ; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-NEXT:    addps %xmm0, %xmm1
-; SSE3-NEXT:    haddps %xmm1, %xmm1
-; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    addps %xmm1, %xmm0
+; SSE3-NEXT:    haddps %xmm0, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hadd32_4_optsize:
@@ -1928,9 +1924,8 @@
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movaps %xmm0, %xmm1
 ; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-NEXT:    addps %xmm0, %xmm1
-; SSE3-NEXT:    haddps %xmm1, %xmm1
-; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    addps %xmm1, %xmm0
+; SSE3-NEXT:    haddps %xmm0, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hadd32_8_optsize:
@@ -1953,9 +1948,8 @@
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movaps %xmm0, %xmm1
 ; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-NEXT:    addps %xmm0, %xmm1
-; SSE3-NEXT:    haddps %xmm1, %xmm1
-; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    addps %xmm1, %xmm0
+; SSE3-NEXT:    haddps %xmm0, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hadd32_16_optsize:
@@ -1978,9 +1972,8 @@
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movaps %xmm0, %xmm1
 ; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-NEXT:    addps %xmm0, %xmm1
-; SSE3-NEXT:    haddps %xmm1, %xmm1
-; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    addps %xmm1, %xmm0
+; SSE3-NEXT:    haddps %xmm0, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hadd32_4_pgso:
@@ -2002,9 +1995,8 @@
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movaps %xmm0, %xmm1
 ; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-NEXT:    addps %xmm0, %xmm1
-; SSE3-NEXT:    haddps %xmm1, %xmm1
-; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    addps %xmm1, %xmm0
+; SSE3-NEXT:    haddps %xmm0, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hadd32_8_pgso:
@@ -2027,9 +2019,8 @@
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movaps %xmm0, %xmm1
 ; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-NEXT:    addps %xmm0, %xmm1
-; SSE3-NEXT:    haddps %xmm1, %xmm1
-; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    addps %xmm1, %xmm0
+; SSE3-NEXT:    haddps %xmm0, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hadd32_16_pgso:
@@ -2052,8 +2043,8 @@
 ; SSE3-SLOW:       # %bb.0:
 ; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT:    addps %xmm0, %xmm1
-; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 ; SSE3-SLOW-NEXT:    retq
 ;
@@ -2061,9 +2052,8 @@
 ; SSE3-FAST:       # %bb.0:
 ; SSE3-FAST-NEXT:    movaps %xmm0, %xmm1
 ; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-FAST-NEXT:    addps %xmm0, %xmm1
-; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
-; SSE3-FAST-NEXT:    movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT:    addps %xmm1, %xmm0
+; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
 ; SSE3-FAST-NEXT:    retq
 ;
 ; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32:
@@ -2097,8 +2087,8 @@
 ; SSE3-SLOW:       # %bb.0:
 ; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT:    addps %xmm0, %xmm1
-; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 ; SSE3-SLOW-NEXT:    retq
 ;
@@ -2106,9 +2096,8 @@
 ; SSE3-FAST:       # %bb.0:
 ; SSE3-FAST-NEXT:    movaps %xmm0, %xmm1
 ; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-FAST-NEXT:    addps %xmm0, %xmm1
-; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
-; SSE3-FAST-NEXT:    movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT:    addps %xmm1, %xmm0
+; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
 ; SSE3-FAST-NEXT:    retq
 ;
 ; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
@@ -2140,8 +2129,8 @@
 ; SSE3-SLOW:       # %bb.0:
 ; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
 ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT:    addps %xmm0, %xmm1
-; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 ; SSE3-SLOW-NEXT:    retq
 ;
@@ -2149,9 +2138,8 @@
 ; SSE3-FAST:       # %bb.0:
 ; SSE3-FAST-NEXT:    movaps %xmm0, %xmm1
 ; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-FAST-NEXT:    addps %xmm0, %xmm1
-; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
-; SSE3-FAST-NEXT:    movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT:    addps %xmm1, %xmm0
+; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
 ; SSE3-FAST-NEXT:    retq
 ;
 ; AVX-SLOW-LABEL: partial_reduction_fadd_v16f32:
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
index 5a5fa77..3e97c22 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
@@ -13,9 +13,9 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps %xmm0, %xmm1
 ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT:    addps %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
 ; SSE2-NEXT:    addss %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -23,8 +23,8 @@
 ; SSSE3-SLOW:       # %bb.0:
 ; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
 ; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSSE3-SLOW-NEXT:    addps %xmm0, %xmm1
-; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 ; SSSE3-SLOW-NEXT:    retq
 ;
@@ -127,9 +127,9 @@
 ; SSE2-NEXT:    addps %xmm1, %xmm0
 ; SSE2-NEXT:    movaps %xmm0, %xmm1
 ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT:    addps %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
 ; SSE2-NEXT:    addss %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -138,8 +138,8 @@
 ; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
 ; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
 ; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSSE3-SLOW-NEXT:    addps %xmm0, %xmm1
-; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 ; SSSE3-SLOW-NEXT:    retq
 ;
@@ -197,9 +197,9 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    addpd %xmm3, %xmm1
 ; SSE2-NEXT:    addpd %xmm2, %xmm1
-; SSE2-NEXT:    addpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT:    addpd %xmm1, %xmm0
+; SSE2-NEXT:    movapd %xmm0, %xmm1
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE2-NEXT:    addsd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -207,9 +207,9 @@
 ; SSSE3-SLOW:       # %bb.0:
 ; SSSE3-SLOW-NEXT:    addpd %xmm3, %xmm1
 ; SSSE3-SLOW-NEXT:    addpd %xmm2, %xmm1
-; SSSE3-SLOW-NEXT:    addpd %xmm0, %xmm1
-; SSSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
-; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSSE3-SLOW-NEXT:    addpd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSSE3-SLOW-NEXT:    addsd %xmm1, %xmm0
 ; SSSE3-SLOW-NEXT:    retq
 ;
@@ -217,9 +217,8 @@
 ; SSSE3-FAST:       # %bb.0:
 ; SSSE3-FAST-NEXT:    addpd %xmm3, %xmm1
 ; SSSE3-FAST-NEXT:    addpd %xmm2, %xmm1
-; SSSE3-FAST-NEXT:    addpd %xmm0, %xmm1
-; SSSE3-FAST-NEXT:    haddpd %xmm1, %xmm1
-; SSSE3-FAST-NEXT:    movapd %xmm1, %xmm0
+; SSSE3-FAST-NEXT:    addpd %xmm1, %xmm0
+; SSSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
 ; SSSE3-FAST-NEXT:    retq
 ;
 ; AVX1-SLOW-LABEL: PR37890_v8f64:
@@ -267,34 +266,34 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    addps %xmm3, %xmm1
 ; SSE2-NEXT:    addps %xmm2, %xmm1
-; SSE2-NEXT:    addps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm2
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE2-NEXT:    addps %xmm1, %xmm2
-; SSE2-NEXT:    movaps %xmm2, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1]
-; SSE2-NEXT:    addss %xmm2, %xmm0
+; SSE2-NEXT:    addps %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT:    addps %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; SSE2-NEXT:    addss %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-SLOW-LABEL: PR37890_v16f32:
 ; SSSE3-SLOW:       # %bb.0:
 ; SSSE3-SLOW-NEXT:    addps %xmm3, %xmm1
 ; SSSE3-SLOW-NEXT:    addps %xmm2, %xmm1
-; SSSE3-SLOW-NEXT:    addps %xmm0, %xmm1
-; SSSE3-SLOW-NEXT:    movaps %xmm1, %xmm2
-; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm2
-; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSSE3-SLOW-NEXT:    addss %xmm2, %xmm0
+; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm0
 ; SSSE3-SLOW-NEXT:    retq
 ;
 ; SSSE3-FAST-LABEL: PR37890_v16f32:
 ; SSSE3-FAST:       # %bb.0:
 ; SSSE3-FAST-NEXT:    addps %xmm3, %xmm1
 ; SSSE3-FAST-NEXT:    addps %xmm2, %xmm1
-; SSSE3-FAST-NEXT:    addps %xmm0, %xmm1
-; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm0
-; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSSE3-FAST-NEXT:    addps %xmm1, %xmm0
+; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm1
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSSE3-FAST-NEXT:    addps %xmm1, %xmm0
 ; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
 ; SSSE3-FAST-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 98414c3..3565165 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -535,53 +535,53 @@
 define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
 ; SSSE3-SLOW-LABEL: sequential_sum_v4f32_v4f32:
 ; SSSE3-SLOW:       # %bb.0:
-; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm4
-; SSSE3-SLOW-NEXT:    haddps %xmm1, %xmm4
 ; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm5
-; SSSE3-SLOW-NEXT:    unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; SSSE3-SLOW-NEXT:    haddps %xmm1, %xmm5
+; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm4
+; SSSE3-SLOW-NEXT:    unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
 ; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
 ; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
 ; SSSE3-SLOW-NEXT:    addps %xmm2, %xmm0
-; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1]
-; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3]
-; SSSE3-SLOW-NEXT:    addps %xmm4, %xmm5
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1]
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
+; SSSE3-SLOW-NEXT:    addps %xmm5, %xmm4
 ; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
-; SSSE3-SLOW-NEXT:    addps %xmm5, %xmm1
+; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm4
 ; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3]
 ; SSSE3-SLOW-NEXT:    addps %xmm3, %xmm0
-; SSSE3-SLOW-NEXT:    movaps %xmm3, %xmm2
-; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; SSSE3-SLOW-NEXT:    addps %xmm0, %xmm2
+; SSSE3-SLOW-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
+; SSSE3-SLOW-NEXT:    addps %xmm0, %xmm1
 ; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
-; SSSE3-SLOW-NEXT:    addps %xmm2, %xmm3
-; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
-; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
-; SSSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm3
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
+; SSSE3-SLOW-NEXT:    movaps %xmm4, %xmm0
 ; SSSE3-SLOW-NEXT:    retq
 ;
 ; SSSE3-FAST-LABEL: sequential_sum_v4f32_v4f32:
 ; SSSE3-FAST:       # %bb.0:
-; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm4
-; SSSE3-FAST-NEXT:    haddps %xmm1, %xmm4
 ; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm5
-; SSSE3-FAST-NEXT:    unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; SSSE3-FAST-NEXT:    haddps %xmm1, %xmm5
+; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm4
+; SSSE3-FAST-NEXT:    unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
 ; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
-; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3]
+; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
 ; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
 ; SSSE3-FAST-NEXT:    haddps %xmm2, %xmm2
-; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,1]
-; SSSE3-FAST-NEXT:    addps %xmm4, %xmm5
-; SSSE3-FAST-NEXT:    addps %xmm5, %xmm1
+; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,1]
+; SSSE3-FAST-NEXT:    addps %xmm5, %xmm4
+; SSSE3-FAST-NEXT:    addps %xmm1, %xmm4
 ; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm0
 ; SSSE3-FAST-NEXT:    haddps %xmm3, %xmm0
-; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm2
-; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; SSSE3-FAST-NEXT:    addps %xmm0, %xmm2
+; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
+; SSSE3-FAST-NEXT:    addps %xmm0, %xmm1
 ; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
-; SSSE3-FAST-NEXT:    addps %xmm2, %xmm3
-; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
-; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
-; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-FAST-NEXT:    addps %xmm1, %xmm3
+; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
+; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
+; SSSE3-FAST-NEXT:    movaps %xmm4, %xmm0
 ; SSSE3-FAST-NEXT:    retq
 ;
 ; AVX-SLOW-LABEL: sequential_sum_v4f32_v4f32:
@@ -961,13 +961,13 @@
 ; SSSE3-SLOW:       # %bb.0:
 ; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm4
 ; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
-; SSSE3-SLOW-NEXT:    addps %xmm0, %xmm4
-; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addps %xmm4, %xmm0
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
 ; SSSE3-SLOW-NEXT:    movaps %xmm1, %xmm5
 ; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
 ; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm5
 ; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
 ; SSSE3-SLOW-NEXT:    movaps %xmm2, %xmm1
 ; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
 ; SSSE3-SLOW-NEXT:    addps %xmm2, %xmm1
@@ -976,10 +976,10 @@
 ; SSSE3-SLOW-NEXT:    addps %xmm3, %xmm2
 ; SSSE3-SLOW-NEXT:    movaps %xmm2, %xmm3
 ; SSSE3-SLOW-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
-; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
+; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
 ; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
-; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0]
 ; SSSE3-SLOW-NEXT:    addps %xmm4, %xmm0
 ; SSSE3-SLOW-NEXT:    retq
 ;
@@ -1050,12 +1050,12 @@
 ; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32:
 ; SSSE3-SLOW:       # %bb.0:
 ; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm4
-; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
+; SSSE3-SLOW-NEXT:    paddd %xmm4, %xmm0
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
 ; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
 ; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm5
 ; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
-; SSSE3-SLOW-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-SLOW-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
 ; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
 ; SSSE3-SLOW-NEXT:    paddd %xmm2, %xmm1
 ; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
@@ -1063,12 +1063,11 @@
 ; SSSE3-SLOW-NEXT:    paddd %xmm3, %xmm6
 ; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1]
 ; SSSE3-SLOW-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSSE3-SLOW-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-SLOW-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
 ; SSSE3-SLOW-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
-; SSSE3-SLOW-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; SSSE3-SLOW-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
-; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm4
-; SSSE3-SLOW-NEXT:    movdqa %xmm4, %xmm0
+; SSSE3-SLOW-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSSE3-SLOW-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-SLOW-NEXT:    paddd %xmm4, %xmm0
 ; SSSE3-SLOW-NEXT:    retq
 ;
 ; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32:
diff --git a/llvm/test/CodeGen/X86/lzcnt-cmp.ll b/llvm/test/CodeGen/X86/lzcnt-cmp.ll
index c094920..01d6c0e 100644
--- a/llvm/test/CodeGen/X86/lzcnt-cmp.ll
+++ b/llvm/test/CodeGen/X86/lzcnt-cmp.ll
@@ -194,11 +194,11 @@
 ; X64-LABEL: lshr_ctlz_cmpeq_zero_v2i64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pxor %xmm1, %xmm1
-; X64-NEXT:    pcmpeqd %xmm0, %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
-; X64-NEXT:    pand %xmm1, %xmm2
-; X64-NEXT:    pcmpeqd %xmm0, %xmm0
-; X64-NEXT:    pxor %xmm2, %xmm0
+; X64-NEXT:    pcmpeqd %xmm1, %xmm0
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X64-NEXT:    pand %xmm1, %xmm0
+; X64-NEXT:    pcmpeqd %xmm1, %xmm1
+; X64-NEXT:    pxor %xmm1, %xmm0
 ; X64-NEXT:    retq
   %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0)
   %lshr = lshr <2 x i64> %ctlz, <i64 6, i64 6>
@@ -232,8 +232,8 @@
 ; X64-LABEL: lshr_ctlz_cmpne_zero_v2i64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pxor %xmm1, %xmm1
-; X64-NEXT:    pcmpeqd %xmm0, %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
+; X64-NEXT:    pcmpeqd %xmm1, %xmm0
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
 ; X64-NEXT:    pand %xmm1, %xmm0
 ; X64-NEXT:    retq
   %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0)
diff --git a/llvm/test/CodeGen/X86/nontemporal-loads.ll b/llvm/test/CodeGen/X86/nontemporal-loads.ll
index 16e684b..88d8aa7 100644
--- a/llvm/test/CodeGen/X86/nontemporal-loads.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-loads.ll
@@ -1764,10 +1764,10 @@
 ; SSE2-NEXT:    pcmpeqd %xmm8, %xmm7
 ; SSE2-NEXT:    pcmpeqd %xmm8, %xmm6
 ; SSE2-NEXT:    pcmpeqd %xmm8, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm8, %xmm4
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pandn (%rdi), %xmm4
-; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm8
+; SSE2-NEXT:    pand %xmm8, %xmm0
+; SSE2-NEXT:    pandn (%rdi), %xmm8
+; SSE2-NEXT:    por %xmm8, %xmm0
 ; SSE2-NEXT:    pand %xmm5, %xmm1
 ; SSE2-NEXT:    pandn 16(%rdi), %xmm5
 ; SSE2-NEXT:    por %xmm5, %xmm1
diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
index 2ba5cb9..85ed998 100644
--- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
+++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
@@ -159,10 +159,10 @@
 ; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; SSE4-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE4-NEXT:    por %xmm2, %xmm1
-; SSE4-NEXT:    movdqa {{.*#+}} xmm0 = [1431655765,858993459,715827882,477218588]
-; SSE4-NEXT:    pminud %xmm1, %xmm0
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE4-NEXT:    por %xmm2, %xmm0
+; SSE4-NEXT:    movdqa {{.*#+}} xmm1 = [1431655765,858993459,715827882,477218588]
+; SSE4-NEXT:    pminud %xmm0, %xmm1
 ; SSE4-NEXT:    pcmpeqd %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
index c34f724..8ac9357 100644
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -820,13 +820,11 @@
 define <8 x i32> @zext_mulhuw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: zext_mulhuw_v8i16_lshr:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    pmulhuw %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pmulhuw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: zext_mulhuw_v8i16_lshr:
@@ -854,13 +852,11 @@
 define <8 x i32> @mulhsw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: mulhsw_v8i16_lshr:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    pmulhw %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pmulhw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: mulhsw_v8i16_lshr:
@@ -920,18 +916,17 @@
 define <16 x i32> @zext_mulhuw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) {
 ; SSE2-LABEL: zext_mulhuw_v16i16_lshr:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    pmulhuw %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pmulhuw %xmm2, %xmm4
 ; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
-; SSE2-NEXT:    pmulhuw %xmm3, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    pmulhuw %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: zext_mulhuw_v16i16_lshr:
@@ -971,18 +966,17 @@
 define <16 x i32> @mulhsw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) {
 ; SSE2-LABEL: mulhsw_v16i16_lshr:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    pmulhw %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pmulhw %xmm2, %xmm4
 ; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
-; SSE2-NEXT:    pmulhw %xmm3, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    pmulhw %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: mulhsw_v16i16_lshr:
diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll
index ad0fb4c..010a615 100644
--- a/llvm/test/CodeGen/X86/popcnt.ll
+++ b/llvm/test/CodeGen/X86/popcnt.ll
@@ -31,14 +31,14 @@
 ; X64-NEXT:    shrb %al
 ; X64-NEXT:    andb $85, %al
 ; X64-NEXT:    subb %al, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andb $51, %al
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    andb $51, %cl
 ; X64-NEXT:    shrb $2, %dil
 ; X64-NEXT:    andb $51, %dil
-; X64-NEXT:    addb %al, %dil
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    addb %dil, %cl
+; X64-NEXT:    movl %ecx, %eax
 ; X64-NEXT:    shrb $4, %al
-; X64-NEXT:    addb %dil, %al
+; X64-NEXT:    addb %cl, %al
 ; X64-NEXT:    andb $15, %al
 ; X64-NEXT:    retq
 ;
@@ -225,15 +225,15 @@
 ; X64-NEXT:    movq %rdi, %rcx
 ; X64-NEXT:    andq %rax, %rcx
 ; X64-NEXT:    shrq $2, %rdi
-; X64-NEXT:    andq %rax, %rdi
-; X64-NEXT:    addq %rcx, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shrq $4, %rax
-; X64-NEXT:    addq %rdi, %rax
-; X64-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT:    andq %rax, %rcx
+; X64-NEXT:    andq %rdi, %rax
+; X64-NEXT:    addq %rcx, %rax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    shrq $4, %rcx
+; X64-NEXT:    addq %rax, %rcx
+; X64-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT:    andq %rcx, %rdx
 ; X64-NEXT:    movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; X64-NEXT:    imulq %rcx, %rax
+; X64-NEXT:    imulq %rdx, %rax
 ; X64-NEXT:    shrq $56, %rax
 ; X64-NEXT:    retq
 ;
@@ -385,36 +385,36 @@
 ; X64-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
 ; X64-NEXT:    andq %r8, %rax
 ; X64-NEXT:    subq %rax, %rsi
-; X64-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    andq %rax, %rcx
+; X64-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    andq %rcx, %rax
 ; X64-NEXT:    shrq $2, %rsi
-; X64-NEXT:    andq %rax, %rsi
-; X64-NEXT:    addq %rcx, %rsi
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    shrq $4, %rcx
-; X64-NEXT:    addq %rsi, %rcx
+; X64-NEXT:    andq %rcx, %rsi
+; X64-NEXT:    addq %rsi, %rax
+; X64-NEXT:    movq %rax, %rdx
+; X64-NEXT:    shrq $4, %rdx
+; X64-NEXT:    addq %rax, %rdx
 ; X64-NEXT:    movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT:    andq %r9, %rcx
-; X64-NEXT:    movabsq $72340172838076673, %rdx # imm = 0x101010101010101
-; X64-NEXT:    imulq %rdx, %rcx
-; X64-NEXT:    shrq $56, %rcx
-; X64-NEXT:    movq %rdi, %rsi
-; X64-NEXT:    shrq %rsi
-; X64-NEXT:    andq %r8, %rsi
-; X64-NEXT:    subq %rsi, %rdi
-; X64-NEXT:    movq %rdi, %rsi
-; X64-NEXT:    andq %rax, %rsi
-; X64-NEXT:    shrq $2, %rdi
-; X64-NEXT:    andq %rax, %rdi
-; X64-NEXT:    addq %rsi, %rdi
+; X64-NEXT:    andq %r9, %rdx
+; X64-NEXT:    movabsq $72340172838076673, %rsi # imm = 0x101010101010101
+; X64-NEXT:    imulq %rsi, %rdx
+; X64-NEXT:    shrq $56, %rdx
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    shrq %rax
+; X64-NEXT:    andq %r8, %rax
+; X64-NEXT:    subq %rax, %rdi
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    andq %rcx, %rax
+; X64-NEXT:    shrq $2, %rdi
+; X64-NEXT:    andq %rdi, %rcx
+; X64-NEXT:    addq %rax, %rcx
+; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    shrq $4, %rax
-; X64-NEXT:    addq %rdi, %rax
-; X64-NEXT:    andq %r9, %rax
-; X64-NEXT:    imulq %rdx, %rax
-; X64-NEXT:    shrq $56, %rax
 ; X64-NEXT:    addq %rcx, %rax
+; X64-NEXT:    andq %r9, %rax
+; X64-NEXT:    imulq %rsi, %rax
+; X64-NEXT:    shrq $56, %rax
+; X64-NEXT:    addq %rdx, %rax
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    retq
 ;
@@ -579,15 +579,15 @@
 ; X64-NEXT:    movq %rdi, %rcx
 ; X64-NEXT:    andq %rax, %rcx
 ; X64-NEXT:    shrq $2, %rdi
-; X64-NEXT:    andq %rax, %rdi
-; X64-NEXT:    addq %rcx, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shrq $4, %rax
-; X64-NEXT:    addq %rdi, %rax
-; X64-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT:    andq %rax, %rcx
+; X64-NEXT:    andq %rdi, %rax
+; X64-NEXT:    addq %rcx, %rax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    shrq $4, %rcx
+; X64-NEXT:    addq %rax, %rcx
+; X64-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT:    andq %rcx, %rdx
 ; X64-NEXT:    movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; X64-NEXT:    imulq %rcx, %rax
+; X64-NEXT:    imulq %rdx, %rax
 ; X64-NEXT:    shrq $56, %rax
 ; X64-NEXT:    retq
 ;
@@ -721,15 +721,15 @@
 ; X64-NEXT:    movq %rdi, %rcx
 ; X64-NEXT:    andq %rax, %rcx
 ; X64-NEXT:    shrq $2, %rdi
-; X64-NEXT:    andq %rax, %rdi
-; X64-NEXT:    addq %rcx, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shrq $4, %rax
-; X64-NEXT:    addq %rdi, %rax
-; X64-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT:    andq %rax, %rcx
+; X64-NEXT:    andq %rdi, %rax
+; X64-NEXT:    addq %rcx, %rax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    shrq $4, %rcx
+; X64-NEXT:    addq %rax, %rcx
+; X64-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT:    andq %rcx, %rdx
 ; X64-NEXT:    movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; X64-NEXT:    imulq %rcx, %rax
+; X64-NEXT:    imulq %rdx, %rax
 ; X64-NEXT:    shrq $56, %rax
 ; X64-NEXT:    retq
 ;
@@ -890,36 +890,36 @@
 ; X64-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
 ; X64-NEXT:    andq %r8, %rax
 ; X64-NEXT:    subq %rax, %rsi
-; X64-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    andq %rax, %rcx
+; X64-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    andq %rcx, %rax
 ; X64-NEXT:    shrq $2, %rsi
-; X64-NEXT:    andq %rax, %rsi
-; X64-NEXT:    addq %rcx, %rsi
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    shrq $4, %rcx
-; X64-NEXT:    addq %rsi, %rcx
+; X64-NEXT:    andq %rcx, %rsi
+; X64-NEXT:    addq %rsi, %rax
+; X64-NEXT:    movq %rax, %rdx
+; X64-NEXT:    shrq $4, %rdx
+; X64-NEXT:    addq %rax, %rdx
 ; X64-NEXT:    movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT:    andq %r9, %rcx
-; X64-NEXT:    movabsq $72340172838076673, %rdx # imm = 0x101010101010101
-; X64-NEXT:    imulq %rdx, %rcx
-; X64-NEXT:    shrq $56, %rcx
-; X64-NEXT:    movq %rdi, %rsi
-; X64-NEXT:    shrq %rsi
-; X64-NEXT:    andq %r8, %rsi
-; X64-NEXT:    subq %rsi, %rdi
-; X64-NEXT:    movq %rdi, %rsi
-; X64-NEXT:    andq %rax, %rsi
-; X64-NEXT:    shrq $2, %rdi
-; X64-NEXT:    andq %rax, %rdi
-; X64-NEXT:    addq %rsi, %rdi
+; X64-NEXT:    andq %r9, %rdx
+; X64-NEXT:    movabsq $72340172838076673, %rsi # imm = 0x101010101010101
+; X64-NEXT:    imulq %rsi, %rdx
+; X64-NEXT:    shrq $56, %rdx
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    shrq %rax
+; X64-NEXT:    andq %r8, %rax
+; X64-NEXT:    subq %rax, %rdi
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    andq %rcx, %rax
+; X64-NEXT:    shrq $2, %rdi
+; X64-NEXT:    andq %rdi, %rcx
+; X64-NEXT:    addq %rax, %rcx
+; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    shrq $4, %rax
-; X64-NEXT:    addq %rdi, %rax
-; X64-NEXT:    andq %r9, %rax
-; X64-NEXT:    imulq %rdx, %rax
-; X64-NEXT:    shrq $56, %rax
 ; X64-NEXT:    addq %rcx, %rax
+; X64-NEXT:    andq %r9, %rax
+; X64-NEXT:    imulq %rsi, %rax
+; X64-NEXT:    shrq $56, %rax
+; X64-NEXT:    addq %rdx, %rax
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    retq
 ;
@@ -1151,15 +1151,15 @@
 ; X64-NEXT:    movq %rdi, %rcx
 ; X64-NEXT:    andq %rax, %rcx
 ; X64-NEXT:    shrq $2, %rdi
-; X64-NEXT:    andq %rax, %rdi
-; X64-NEXT:    addq %rcx, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shrq $4, %rax
-; X64-NEXT:    addq %rdi, %rax
-; X64-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT:    andq %rax, %rcx
+; X64-NEXT:    andq %rdi, %rax
+; X64-NEXT:    addq %rcx, %rax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    shrq $4, %rcx
+; X64-NEXT:    addq %rax, %rcx
+; X64-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT:    andq %rcx, %rdx
 ; X64-NEXT:    movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; X64-NEXT:    imulq %rcx, %rax
+; X64-NEXT:    imulq %rdx, %rax
 ; X64-NEXT:    shrq $56, %rax
 ; X64-NEXT:    retq
 ;
@@ -1320,36 +1320,36 @@
 ; X64-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
 ; X64-NEXT:    andq %r8, %rax
 ; X64-NEXT:    subq %rax, %rsi
-; X64-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    andq %rax, %rcx
+; X64-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    andq %rcx, %rax
 ; X64-NEXT:    shrq $2, %rsi
-; X64-NEXT:    andq %rax, %rsi
-; X64-NEXT:    addq %rcx, %rsi
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    shrq $4, %rcx
-; X64-NEXT:    addq %rsi, %rcx
+; X64-NEXT:    andq %rcx, %rsi
+; X64-NEXT:    addq %rsi, %rax
+; X64-NEXT:    movq %rax, %rdx
+; X64-NEXT:    shrq $4, %rdx
+; X64-NEXT:    addq %rax, %rdx
 ; X64-NEXT:    movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT:    andq %r9, %rcx
-; X64-NEXT:    movabsq $72340172838076673, %rdx # imm = 0x101010101010101
-; X64-NEXT:    imulq %rdx, %rcx
-; X64-NEXT:    shrq $56, %rcx
-; X64-NEXT:    movq %rdi, %rsi
-; X64-NEXT:    shrq %rsi
-; X64-NEXT:    andq %r8, %rsi
-; X64-NEXT:    subq %rsi, %rdi
-; X64-NEXT:    movq %rdi, %rsi
-; X64-NEXT:    andq %rax, %rsi
-; X64-NEXT:    shrq $2, %rdi
-; X64-NEXT:    andq %rax, %rdi
-; X64-NEXT:    addq %rsi, %rdi
+; X64-NEXT:    andq %r9, %rdx
+; X64-NEXT:    movabsq $72340172838076673, %rsi # imm = 0x101010101010101
+; X64-NEXT:    imulq %rsi, %rdx
+; X64-NEXT:    shrq $56, %rdx
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    shrq %rax
+; X64-NEXT:    andq %r8, %rax
+; X64-NEXT:    subq %rax, %rdi
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    andq %rcx, %rax
+; X64-NEXT:    shrq $2, %rdi
+; X64-NEXT:    andq %rdi, %rcx
+; X64-NEXT:    addq %rax, %rcx
+; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    shrq $4, %rax
-; X64-NEXT:    addq %rdi, %rax
-; X64-NEXT:    andq %r9, %rax
-; X64-NEXT:    imulq %rdx, %rax
-; X64-NEXT:    shrq $56, %rax
 ; X64-NEXT:    addq %rcx, %rax
+; X64-NEXT:    andq %r9, %rax
+; X64-NEXT:    imulq %rsi, %rax
+; X64-NEXT:    shrq $56, %rax
+; X64-NEXT:    addq %rdx, %rax
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/pull-binop-through-shift.ll b/llvm/test/CodeGen/X86/pull-binop-through-shift.ll
index 28f6c9f..180b609 100644
--- a/llvm/test/CodeGen/X86/pull-binop-through-shift.ll
+++ b/llvm/test/CodeGen/X86/pull-binop-through-shift.ll
@@ -140,9 +140,9 @@
 define i32 @add_signbit_shl(i32 %x, i32* %dst) {
 ; X64-LABEL: add_signbit_shl:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shll $8, %eax
-; X64-NEXT:    addl $-16777216, %eax # imm = 0xFF000000
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    shll $8, %edi
+; X64-NEXT:    leal -16777216(%rdi), %eax
 ; X64-NEXT:    movl %eax, (%rsi)
 ; X64-NEXT:    retq
 ;
@@ -162,9 +162,9 @@
 define i32 @add_nosignbit_shl(i32 %x, i32* %dst) {
 ; X64-LABEL: add_nosignbit_shl:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shll $8, %eax
-; X64-NEXT:    addl $-16777216, %eax # imm = 0xFF000000
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    shll $8, %edi
+; X64-NEXT:    leal -16777216(%rdi), %eax
 ; X64-NEXT:    movl %eax, (%rsi)
 ; X64-NEXT:    retq
 ;
@@ -322,8 +322,8 @@
 define i32 @add_signbit_lshr(i32 %x, i32* %dst) {
 ; X64-LABEL: add_signbit_lshr:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    addl $-65536, %eax # imm = 0xFFFF0000
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal -65536(%rdi), %eax
 ; X64-NEXT:    shrl $8, %eax
 ; X64-NEXT:    movl %eax, (%rsi)
 ; X64-NEXT:    retq
@@ -344,8 +344,8 @@
 define i32 @add_nosignbit_lshr(i32 %x, i32* %dst) {
 ; X64-LABEL: add_nosignbit_lshr:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    addl $2147418112, %eax # imm = 0x7FFF0000
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal 2147418112(%rdi), %eax
 ; X64-NEXT:    shrl $8, %eax
 ; X64-NEXT:    movl %eax, (%rsi)
 ; X64-NEXT:    retq
@@ -503,8 +503,8 @@
 define i32 @add_signbit_ashr(i32 %x, i32* %dst) {
 ; X64-LABEL: add_signbit_ashr:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    addl $-65536, %eax # imm = 0xFFFF0000
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal -65536(%rdi), %eax
 ; X64-NEXT:    sarl $8, %eax
 ; X64-NEXT:    movl %eax, (%rsi)
 ; X64-NEXT:    retq
@@ -525,8 +525,8 @@
 define i32 @add_nosignbit_ashr(i32 %x, i32* %dst) {
 ; X64-LABEL: add_nosignbit_ashr:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    addl $2147418112, %eax # imm = 0x7FFF0000
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal 2147418112(%rdi), %eax
 ; X64-NEXT:    sarl $8, %eax
 ; X64-NEXT:    movl %eax, (%rsi)
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/rem.ll b/llvm/test/CodeGen/X86/rem.ll
index c3f6d61..893b49f 100644
--- a/llvm/test/CodeGen/X86/rem.ll
+++ b/llvm/test/CodeGen/X86/rem.ll
@@ -8,16 +8,15 @@
 ; CHECK-NEXT:    movl $-2139062143, %edx # imm = 0x80808081
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    imull %edx
-; CHECK-NEXT:    addl %ecx, %edx
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    shrl $31, %eax
-; CHECK-NEXT:    sarl $7, %edx
-; CHECK-NEXT:    addl %eax, %edx
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    shll $8, %eax
-; CHECK-NEXT:    subl %eax, %edx
-; CHECK-NEXT:    addl %edx, %ecx
-; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    leal (%edx,%ecx), %eax
+; CHECK-NEXT:    movl %eax, %edx
+; CHECK-NEXT:    shrl $31, %edx
+; CHECK-NEXT:    sarl $7, %eax
+; CHECK-NEXT:    addl %edx, %eax
+; CHECK-NEXT:    movl %eax, %edx
+; CHECK-NEXT:    shll $8, %edx
+; CHECK-NEXT:    subl %edx, %eax
+; CHECK-NEXT:    addl %ecx, %eax
 ; CHECK-NEXT:    retl
   %tmp1 = srem i32 %X, 255
   ret i32 %tmp1
diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll
index 81474dd..3feff29 100644
--- a/llvm/test/CodeGen/X86/sat-add.ll
+++ b/llvm/test/CodeGen/X86/sat-add.ll
@@ -1265,11 +1265,12 @@
 ; SSE42-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE42-NEXT:    paddq %xmm0, %xmm1
-; SSE42-NEXT:    pxor %xmm2, %xmm0
-; SSE42-NEXT:    pxor %xmm1, %xmm2
-; SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
-; SSE42-NEXT:    por %xmm1, %xmm0
+; SSE42-NEXT:    movdqa %xmm0, %xmm3
+; SSE42-NEXT:    pxor %xmm2, %xmm3
+; SSE42-NEXT:    paddq %xmm1, %xmm0
+; SSE42-NEXT:    pxor %xmm0, %xmm2
+; SSE42-NEXT:    pcmpgtq %xmm2, %xmm3
+; SSE42-NEXT:    por %xmm3, %xmm0
 ; SSE42-NEXT:    retq
 ;
 ; AVX2-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index 192a124..44acdc7 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -29,15 +29,15 @@
 ; X64-NEXT:    testl %edx, %edx
 ; X64-NEXT:    setne %dl
 ; X64-NEXT:    testb %cl, %dl
-; X64-NEXT:    cmovel %eax, %edi
-; X64-NEXT:    cmpl $65535, %edi # imm = 0xFFFF
+; X64-NEXT:    cmovnel %edi, %eax
+; X64-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X64-NEXT:    movl $65535, %ecx # imm = 0xFFFF
-; X64-NEXT:    cmovll %edi, %ecx
-; X64-NEXT:    cmpl $-65535, %ecx # imm = 0xFFFF0001
-; X64-NEXT:    movl $-65536, %eax # imm = 0xFFFF0000
 ; X64-NEXT:    cmovgel %ecx, %eax
+; X64-NEXT:    cmpl $-65535, %eax # imm = 0xFFFF0001
+; X64-NEXT:    movl $-65536, %ecx # imm = 0xFFFF0000
+; X64-NEXT:    cmovll %ecx, %eax
 ; X64-NEXT:    shrl %eax
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: func:
@@ -45,14 +45,14 @@
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $8, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %edi
-; X86-NEXT:    leal -1(%eax), %esi
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    idivl %esi
+; X86-NEXT:    leal -1(%eax), %edi
+; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    sets %bl
 ; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    sets %cl
@@ -60,13 +60,13 @@
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    setne %dl
 ; X86-NEXT:    testb %cl, %dl
-; X86-NEXT:    cmovel %eax, %esi
-; X86-NEXT:    cmpl $65535, %esi # imm = 0xFFFF
+; X86-NEXT:    cmovnel %edi, %eax
+; X86-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X86-NEXT:    movl $65535, %ecx # imm = 0xFFFF
-; X86-NEXT:    cmovll %esi, %ecx
-; X86-NEXT:    cmpl $-65535, %ecx # imm = 0xFFFF0001
-; X86-NEXT:    movl $-65536, %eax # imm = 0xFFFF0000
 ; X86-NEXT:    cmovgel %ecx, %eax
+; X86-NEXT:    cmpl $-65535, %eax # imm = 0xFFFF0001
+; X86-NEXT:    movl $-65536, %ecx # imm = 0xFFFF0000
+; X86-NEXT:    cmovll %ecx, %eax
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    popl %esi
@@ -99,14 +99,14 @@
 ; X64-NEXT:    testl %edx, %edx
 ; X64-NEXT:    setne %dl
 ; X64-NEXT:    testb %cl, %dl
-; X64-NEXT:    cmovel %eax, %edi
-; X64-NEXT:    cmpl $16383, %edi # imm = 0x3FFF
+; X64-NEXT:    cmovnel %edi, %eax
+; X64-NEXT:    cmpl $16383, %eax # imm = 0x3FFF
 ; X64-NEXT:    movl $16383, %ecx # imm = 0x3FFF
-; X64-NEXT:    cmovll %edi, %ecx
-; X64-NEXT:    cmpl $-16383, %ecx # imm = 0xC001
-; X64-NEXT:    movl $-16384, %eax # imm = 0xC000
 ; X64-NEXT:    cmovgel %ecx, %eax
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    cmpl $-16383, %eax # imm = 0xC001
+; X64-NEXT:    movl $-16384, %ecx # imm = 0xC000
+; X64-NEXT:    cmovll %ecx, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: func2:
@@ -114,14 +114,14 @@
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $14, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %edi
-; X86-NEXT:    leal -1(%eax), %esi
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    idivl %esi
+; X86-NEXT:    leal -1(%eax), %edi
+; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    sets %bl
 ; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    sets %cl
@@ -129,13 +129,13 @@
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    setne %dl
 ; X86-NEXT:    testb %cl, %dl
-; X86-NEXT:    cmovel %eax, %esi
-; X86-NEXT:    cmpl $16383, %esi # imm = 0x3FFF
+; X86-NEXT:    cmovnel %edi, %eax
+; X86-NEXT:    cmpl $16383, %eax # imm = 0x3FFF
 ; X86-NEXT:    movl $16383, %ecx # imm = 0x3FFF
-; X86-NEXT:    cmovll %esi, %ecx
-; X86-NEXT:    cmpl $-16383, %ecx # imm = 0xC001
-; X86-NEXT:    movl $-16384, %eax # imm = 0xC000
 ; X86-NEXT:    cmovgel %ecx, %eax
+; X86-NEXT:    cmpl $-16383, %eax # imm = 0xC001
+; X86-NEXT:    movl $-16384, %ecx # imm = 0xC000
+; X86-NEXT:    cmovll %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -169,16 +169,16 @@
 ; X64-NEXT:    testw %dx, %dx
 ; X64-NEXT:    setne %dl
 ; X64-NEXT:    testb %cl, %dl
-; X64-NEXT:    cmovel %eax, %esi
-; X64-NEXT:    movswl %si, %eax
-; X64-NEXT:    cmpl $16383, %eax # imm = 0x3FFF
+; X64-NEXT:    cmovnel %esi, %eax
+; X64-NEXT:    movswl %ax, %ecx
+; X64-NEXT:    cmpl $16383, %ecx # imm = 0x3FFF
 ; X64-NEXT:    movl $16383, %ecx # imm = 0x3FFF
-; X64-NEXT:    cmovll %esi, %ecx
-; X64-NEXT:    movswl %cx, %eax
-; X64-NEXT:    cmpl $-16383, %eax # imm = 0xC001
-; X64-NEXT:    movl $49152, %eax # imm = 0xC000
 ; X64-NEXT:    cmovgel %ecx, %eax
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    movswl %ax, %ecx
+; X64-NEXT:    cmpl $-16383, %ecx # imm = 0xC001
+; X64-NEXT:    movl $49152, %ecx # imm = 0xC000
+; X64-NEXT:    cmovll %ecx, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: func3:
@@ -188,31 +188,31 @@
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $8, %eax
-; X86-NEXT:    movswl %ax, %edi
+; X86-NEXT:    movswl %ax, %esi
 ; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    shrl $4, %edi
+; X86-NEXT:    shrl $4, %esi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cwtd
-; X86-NEXT:    idivw %di
+; X86-NEXT:    idivw %si
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    leal -1(%eax), %esi
+; X86-NEXT:    leal -1(%eax), %edi
 ; X86-NEXT:    testw %cx, %cx
 ; X86-NEXT:    sets %cl
-; X86-NEXT:    testw %di, %di
+; X86-NEXT:    testw %si, %si
 ; X86-NEXT:    sets %ch
 ; X86-NEXT:    xorb %cl, %ch
 ; X86-NEXT:    testw %dx, %dx
 ; X86-NEXT:    setne %cl
 ; X86-NEXT:    testb %ch, %cl
-; X86-NEXT:    cmovel %eax, %esi
-; X86-NEXT:    movswl %si, %eax
-; X86-NEXT:    cmpl $16383, %eax # imm = 0x3FFF
+; X86-NEXT:    cmovnel %edi, %eax
+; X86-NEXT:    movswl %ax, %ecx
+; X86-NEXT:    cmpl $16383, %ecx # imm = 0x3FFF
 ; X86-NEXT:    movl $16383, %ecx # imm = 0x3FFF
-; X86-NEXT:    cmovll %esi, %ecx
-; X86-NEXT:    movswl %cx, %eax
-; X86-NEXT:    cmpl $-16383, %eax # imm = 0xC001
-; X86-NEXT:    movl $49152, %eax # imm = 0xC000
 ; X86-NEXT:    cmovgel %ecx, %eax
+; X86-NEXT:    movswl %ax, %ecx
+; X86-NEXT:    cmpl $-16383, %ecx # imm = 0xC001
+; X86-NEXT:    movl $49152, %ecx # imm = 0xC000
+; X86-NEXT:    cmovll %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -521,13 +521,14 @@
 ; X64-NEXT:    testl %edx, %edx
 ; X64-NEXT:    setne %dl
 ; X64-NEXT:    testb %cl, %dl
-; X64-NEXT:    cmovel %eax, %edi
-; X64-NEXT:    cmpl $131071, %edi # imm = 0x1FFFF
+; X64-NEXT:    cmovnel %edi, %eax
+; X64-NEXT:    cmpl $131071, %eax # imm = 0x1FFFF
 ; X64-NEXT:    movl $131071, %ecx # imm = 0x1FFFF
-; X64-NEXT:    cmovll %edi, %ecx
-; X64-NEXT:    cmpl $-131071, %ecx # imm = 0xFFFE0001
-; X64-NEXT:    movl $-131072, %eax # imm = 0xFFFE0000
 ; X64-NEXT:    cmovgel %ecx, %eax
+; X64-NEXT:    cmpl $-131071, %eax # imm = 0xFFFE0001
+; X64-NEXT:    movl $-131072, %ecx # imm = 0xFFFE0000
+; X64-NEXT:    cmovll %ecx, %eax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: func6:
@@ -535,14 +536,14 @@
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $7, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %edi
-; X86-NEXT:    leal -1(%eax), %esi
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    idivl %esi
+; X86-NEXT:    leal -1(%eax), %edi
+; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    sets %bl
 ; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    sets %cl
@@ -550,13 +551,13 @@
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    setne %dl
 ; X86-NEXT:    testb %cl, %dl
-; X86-NEXT:    cmovel %eax, %esi
-; X86-NEXT:    cmpl $131071, %esi # imm = 0x1FFFF
+; X86-NEXT:    cmovnel %edi, %eax
+; X86-NEXT:    cmpl $131071, %eax # imm = 0x1FFFF
 ; X86-NEXT:    movl $131071, %ecx # imm = 0x1FFFF
-; X86-NEXT:    cmovll %esi, %ecx
-; X86-NEXT:    cmpl $-131071, %ecx # imm = 0xFFFE0001
-; X86-NEXT:    movl $-131072, %eax # imm = 0xFFFE0000
 ; X86-NEXT:    cmovgel %ecx, %eax
+; X86-NEXT:    cmpl $-131071, %eax # imm = 0xFFFE0001
+; X86-NEXT:    movl $-131072, %ecx # imm = 0xFFFE0000
+; X86-NEXT:    cmovll %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/setcc-combine.ll b/llvm/test/CodeGen/X86/setcc-combine.ll
index 600a123..85b920b 100644
--- a/llvm/test/CodeGen/X86/setcc-combine.ll
+++ b/llvm/test/CodeGen/X86/setcc-combine.ll
@@ -326,11 +326,10 @@
 ; SSE2-LABEL: sub_to_shift_to_add_vec:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    paddd %xmm1, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    pandn %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: sub_to_shift_to_add_vec:
diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll
index e1f6f3e..b0e45f61 100644
--- a/llvm/test/CodeGen/X86/shift-combine.ll
+++ b/llvm/test/CodeGen/X86/shift-combine.ll
@@ -317,9 +317,9 @@
 ;
 ; X64-LABEL: ashr_add_shl_i32_i8_extra_use1:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shll $24, %eax
-; X64-NEXT:    addl $33554432, %eax # imm = 0x2000000
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    shll $24, %edi
+; X64-NEXT:    leal 33554432(%rdi), %eax
 ; X64-NEXT:    movl %eax, (%rsi)
 ; X64-NEXT:    sarl $24, %eax
 ; X64-NEXT:    retq
@@ -371,10 +371,10 @@
 ;
 ; X64-LABEL: ashr_add_shl_i32_i8_extra_use3:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shll $24, %eax
-; X64-NEXT:    movl %eax, (%rsi)
-; X64-NEXT:    addl $33554432, %eax # imm = 0x2000000
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    shll $24, %edi
+; X64-NEXT:    movl %edi, (%rsi)
+; X64-NEXT:    leal 33554432(%rdi), %eax
 ; X64-NEXT:    movl %eax, (%rdx)
 ; X64-NEXT:    sarl $24, %eax
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/shl-crash-on-legalize.ll b/llvm/test/CodeGen/X86/shl-crash-on-legalize.ll
index 301c26e..0168fd9 100644
--- a/llvm/test/CodeGen/X86/shl-crash-on-legalize.ll
+++ b/llvm/test/CodeGen/X86/shl-crash-on-legalize.ll
@@ -11,14 +11,13 @@
 define i32 @PR29058(i8 %x, i32 %y) {
 ; CHECK-LABEL: PR29058:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl %esi, %ecx
 ; CHECK-NEXT:    testb %dil, %dil
 ; CHECK-NEXT:    movl $2147483646, %eax # imm = 0x7FFFFFFE
 ; CHECK-NEXT:    cmovnel %esi, %eax
-; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    cmpb $1, %dil
-; CHECK-NEXT:    sbbl %edx, %edx
-; CHECK-NEXT:    orb %dl, %cl
+; CHECK-NEXT:    sbbl %ecx, %ecx
+; CHECK-NEXT:    orb %sil, %cl
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shll %cl, %eax
 ; CHECK-NEXT:    movq %rax, structMember(%rip)
diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll
index 5937b82..5009056 100644
--- a/llvm/test/CodeGen/X86/slow-pmulld.ll
+++ b/llvm/test/CodeGen/X86/slow-pmulld.ll
@@ -464,11 +464,10 @@
 define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
 ; SLM32-LABEL: test_mul_v8i32_v8i16:
 ; SLM32:       # %bb.0:
-; SLM32-NEXT:    movdqa %xmm0, %xmm1
-; SLM32-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLM32-NEXT:    movdqa %xmm1, %xmm2
+; SLM32-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLM32-NEXT:    movdqa %xmm0, %xmm2
+; SLM32-NEXT:    pmulhuw %xmm1, %xmm2
 ; SLM32-NEXT:    pmullw %xmm0, %xmm1
-; SLM32-NEXT:    pmulhuw %xmm0, %xmm2
 ; SLM32-NEXT:    movdqa %xmm1, %xmm0
 ; SLM32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SLM32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
@@ -476,11 +475,10 @@
 ;
 ; SLM64-LABEL: test_mul_v8i32_v8i16:
 ; SLM64:       # %bb.0:
-; SLM64-NEXT:    movdqa %xmm0, %xmm1
-; SLM64-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLM64-NEXT:    movdqa %xmm1, %xmm2
+; SLM64-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLM64-NEXT:    movdqa %xmm0, %xmm2
+; SLM64-NEXT:    pmulhuw %xmm1, %xmm2
 ; SLM64-NEXT:    pmullw %xmm0, %xmm1
-; SLM64-NEXT:    pmulhuw %xmm0, %xmm2
 ; SLM64-NEXT:    movdqa %xmm1, %xmm0
 ; SLM64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SLM64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
@@ -488,10 +486,9 @@
 ;
 ; SLOW32-LABEL: test_mul_v8i32_v8i16:
 ; SLOW32:       # %bb.0:
-; SLOW32-NEXT:    movdqa %xmm0, %xmm1
-; SLOW32-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLOW32-NEXT:    movdqa %xmm1, %xmm2
-; SLOW32-NEXT:    pmulhuw %xmm0, %xmm2
+; SLOW32-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLOW32-NEXT:    movdqa %xmm0, %xmm2
+; SLOW32-NEXT:    pmulhuw %xmm1, %xmm2
 ; SLOW32-NEXT:    pmullw %xmm0, %xmm1
 ; SLOW32-NEXT:    movdqa %xmm1, %xmm0
 ; SLOW32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
@@ -500,10 +497,9 @@
 ;
 ; SLOW64-LABEL: test_mul_v8i32_v8i16:
 ; SLOW64:       # %bb.0:
-; SLOW64-NEXT:    movdqa %xmm0, %xmm1
-; SLOW64-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLOW64-NEXT:    movdqa %xmm1, %xmm2
-; SLOW64-NEXT:    pmulhuw %xmm0, %xmm2
+; SLOW64-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLOW64-NEXT:    movdqa %xmm0, %xmm2
+; SLOW64-NEXT:    pmulhuw %xmm1, %xmm2
 ; SLOW64-NEXT:    pmullw %xmm0, %xmm1
 ; SLOW64-NEXT:    movdqa %xmm1, %xmm0
 ; SLOW64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
@@ -563,78 +559,78 @@
 define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
 ; SLM32-LABEL: test_mul_v16i32_v16i16:
 ; SLM32:       # %bb.0:
-; SLM32-NEXT:    movdqa %xmm1, %xmm3
-; SLM32-NEXT:    movdqa %xmm0, %xmm1
-; SLM32-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLM32-NEXT:    movdqa %xmm1, %xmm2
-; SLM32-NEXT:    movdqa %xmm3, %xmm4
-; SLM32-NEXT:    pmullw %xmm0, %xmm1
-; SLM32-NEXT:    pmulhuw %xmm0, %xmm2
-; SLM32-NEXT:    pmullw %xmm0, %xmm3
-; SLM32-NEXT:    pmulhuw %xmm0, %xmm4
-; SLM32-NEXT:    movdqa %xmm1, %xmm0
-; SLM32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SLM32-NEXT:    movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLM32-NEXT:    movdqa %xmm0, %xmm4
+; SLM32-NEXT:    movdqa %xmm0, %xmm2
+; SLM32-NEXT:    movdqa %xmm1, %xmm5
+; SLM32-NEXT:    pmullw %xmm3, %xmm4
+; SLM32-NEXT:    pmulhuw %xmm3, %xmm2
+; SLM32-NEXT:    pmulhuw %xmm3, %xmm5
+; SLM32-NEXT:    pmullw %xmm1, %xmm3
+; SLM32-NEXT:    movdqa %xmm4, %xmm0
+; SLM32-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
 ; SLM32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; SLM32-NEXT:    movdqa %xmm3, %xmm2
-; SLM32-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SLM32-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SLM32-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SLM32-NEXT:    movdqa %xmm4, %xmm1
+; SLM32-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
 ; SLM32-NEXT:    retl
 ;
 ; SLM64-LABEL: test_mul_v16i32_v16i16:
 ; SLM64:       # %bb.0:
-; SLM64-NEXT:    movdqa %xmm1, %xmm3
-; SLM64-NEXT:    movdqa %xmm0, %xmm1
-; SLM64-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLM64-NEXT:    movdqa %xmm1, %xmm2
-; SLM64-NEXT:    movdqa %xmm3, %xmm4
-; SLM64-NEXT:    pmullw %xmm0, %xmm1
-; SLM64-NEXT:    pmulhuw %xmm0, %xmm2
-; SLM64-NEXT:    pmullw %xmm0, %xmm3
-; SLM64-NEXT:    pmulhuw %xmm0, %xmm4
-; SLM64-NEXT:    movdqa %xmm1, %xmm0
-; SLM64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SLM64-NEXT:    movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLM64-NEXT:    movdqa %xmm0, %xmm4
+; SLM64-NEXT:    movdqa %xmm0, %xmm2
+; SLM64-NEXT:    movdqa %xmm1, %xmm5
+; SLM64-NEXT:    pmullw %xmm3, %xmm4
+; SLM64-NEXT:    pmulhuw %xmm3, %xmm2
+; SLM64-NEXT:    pmulhuw %xmm3, %xmm5
+; SLM64-NEXT:    pmullw %xmm1, %xmm3
+; SLM64-NEXT:    movdqa %xmm4, %xmm0
+; SLM64-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
 ; SLM64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; SLM64-NEXT:    movdqa %xmm3, %xmm2
-; SLM64-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SLM64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SLM64-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SLM64-NEXT:    movdqa %xmm4, %xmm1
+; SLM64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
 ; SLM64-NEXT:    retq
 ;
 ; SLOW32-LABEL: test_mul_v16i32_v16i16:
 ; SLOW32:       # %bb.0:
-; SLOW32-NEXT:    movdqa %xmm1, %xmm3
-; SLOW32-NEXT:    movdqa %xmm0, %xmm1
-; SLOW32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
 ; SLOW32-NEXT:    movdqa %xmm0, %xmm4
-; SLOW32-NEXT:    pmulhuw %xmm2, %xmm4
-; SLOW32-NEXT:    pmullw %xmm2, %xmm1
-; SLOW32-NEXT:    movdqa %xmm1, %xmm0
-; SLOW32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SLOW32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SLOW32-NEXT:    movdqa %xmm3, %xmm4
-; SLOW32-NEXT:    pmulhuw %xmm2, %xmm4
-; SLOW32-NEXT:    pmullw %xmm2, %xmm3
+; SLOW32-NEXT:    movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLOW32-NEXT:    movdqa %xmm0, %xmm2
+; SLOW32-NEXT:    pmulhuw %xmm3, %xmm2
+; SLOW32-NEXT:    pmullw %xmm3, %xmm4
+; SLOW32-NEXT:    movdqa %xmm4, %xmm0
+; SLOW32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLOW32-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SLOW32-NEXT:    movdqa %xmm1, %xmm5
+; SLOW32-NEXT:    pmulhuw %xmm3, %xmm5
+; SLOW32-NEXT:    pmullw %xmm1, %xmm3
 ; SLOW32-NEXT:    movdqa %xmm3, %xmm2
-; SLOW32-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SLOW32-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SLOW32-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SLOW32-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SLOW32-NEXT:    movdqa %xmm4, %xmm1
 ; SLOW32-NEXT:    retl
 ;
 ; SLOW64-LABEL: test_mul_v16i32_v16i16:
 ; SLOW64:       # %bb.0:
-; SLOW64-NEXT:    movdqa %xmm1, %xmm3
-; SLOW64-NEXT:    movdqa %xmm0, %xmm1
-; SLOW64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
 ; SLOW64-NEXT:    movdqa %xmm0, %xmm4
-; SLOW64-NEXT:    pmulhuw %xmm2, %xmm4
-; SLOW64-NEXT:    pmullw %xmm2, %xmm1
-; SLOW64-NEXT:    movdqa %xmm1, %xmm0
-; SLOW64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SLOW64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SLOW64-NEXT:    movdqa %xmm3, %xmm4
-; SLOW64-NEXT:    pmulhuw %xmm2, %xmm4
-; SLOW64-NEXT:    pmullw %xmm2, %xmm3
+; SLOW64-NEXT:    movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLOW64-NEXT:    movdqa %xmm0, %xmm2
+; SLOW64-NEXT:    pmulhuw %xmm3, %xmm2
+; SLOW64-NEXT:    pmullw %xmm3, %xmm4
+; SLOW64-NEXT:    movdqa %xmm4, %xmm0
+; SLOW64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLOW64-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SLOW64-NEXT:    movdqa %xmm1, %xmm5
+; SLOW64-NEXT:    pmulhuw %xmm3, %xmm5
+; SLOW64-NEXT:    pmullw %xmm1, %xmm3
 ; SLOW64-NEXT:    movdqa %xmm3, %xmm2
-; SLOW64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SLOW64-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SLOW64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SLOW64-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SLOW64-NEXT:    movdqa %xmm4, %xmm1
 ; SLOW64-NEXT:    retq
 ;
 ; SSE4-32-LABEL: test_mul_v16i32_v16i16:
diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll
index 45d089d..ccfc53c 100644
--- a/llvm/test/CodeGen/X86/smul_fix.ll
+++ b/llvm/test/CodeGen/X86/smul_fix.ll
@@ -56,12 +56,11 @@
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    addl %edx, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %ebx, %eax
@@ -69,16 +68,17 @@
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    adcl %edi, %edx
 ; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    subl %ecx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    imull %ebp, %edi
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    subl %ecx, %edi
 ; X86-NEXT:    testl %ebx, %ebx
-; X86-NEXT:    cmovnsl %edi, %ebp
-; X86-NEXT:    movl %ebp, %edx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnsl %ebp, %edx
+; X86-NEXT:    cmovsl %edi, %edx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    testl %ebp, %ebp
+; X86-NEXT:    cmovsl %ecx, %edx
 ; X86-NEXT:    shldl $30, %eax, %edx
 ; X86-NEXT:    shldl $30, %esi, %eax
 ; X86-NEXT:    popl %esi
@@ -333,16 +333,17 @@
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    adcl %edi, %edx
 ; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    subl %esi, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    imull %ebx, %edi
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    subl %esi, %edi
 ; X86-NEXT:    testl %ebp, %ebp
-; X86-NEXT:    cmovnsl %edi, %ebx
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    subl %ecx, %edx
-; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnsl %ebx, %edx
+; X86-NEXT:    cmovsl %edi, %edx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    cmovsl %ecx, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -368,33 +369,32 @@
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %edi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    adcl %ebx, %edx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    subl %esi, %ecx
-; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ebx, %esi
 ; X86-NEXT:    sbbl $0, %esi
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnsl %edi, %esi
-; X86-NEXT:    cmovnsl %edx, %ecx
+; X86-NEXT:    cmovnsl %ebx, %esi
+; X86-NEXT:    cmovnsl %edi, %ecx
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %esi, %edx
diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll
index d752e36..0463886 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll
@@ -61,26 +61,27 @@
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, %ebx
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    imull %esi
+; X86-NEXT:    imull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    adcl %ebp, %edx
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    addl %esi, %edx
@@ -89,40 +90,40 @@
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    sbbl $0, %ebp
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    cmovnsl %ebx, %ebp
 ; X86-NEXT:    cmovnsl %edx, %esi
 ; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    sbbl $0, %edi
+; X86-NEXT:    movl %ebp, %edx
+; X86-NEXT:    sbbl $0, %edx
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnsl %ebp, %edi
+; X86-NEXT:    cmovnsl %ebp, %edx
 ; X86-NEXT:    cmovnsl %esi, %ecx
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    setg %bl
 ; X86-NEXT:    sete %bh
 ; X86-NEXT:    cmpl $2, %ecx
-; X86-NEXT:    setae %dl
-; X86-NEXT:    andb %bh, %dl
-; X86-NEXT:    orb %bl, %dl
-; X86-NEXT:    movl (%esp), %ebx
-; X86-NEXT:    shrdl $2, %eax, %ebx
-; X86-NEXT:    shrdl $2, %ecx, %eax
-; X86-NEXT:    testb %dl, %dl
+; X86-NEXT:    setae %al
+; X86-NEXT:    andb %bh, %al
+; X86-NEXT:    orb %bl, %al
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    shrdl $2, %edi, %ebx
+; X86-NEXT:    shrdl $2, %ecx, %edi
+; X86-NEXT:    testb %al, %al
 ; X86-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
-; X86-NEXT:    cmovel %eax, %esi
-; X86-NEXT:    movl $-1, %edx
-; X86-NEXT:    cmovel %ebx, %edx
-; X86-NEXT:    cmpl $-1, %edi
-; X86-NEXT:    setl %bl
+; X86-NEXT:    cmovel %edi, %esi
+; X86-NEXT:    movl $-1, %edi
+; X86-NEXT:    cmovel %ebx, %edi
+; X86-NEXT:    cmpl $-1, %edx
+; X86-NEXT:    setl %dl
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    cmpl $-2, %ecx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    andb %al, %cl
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    orb %bl, %cl
-; X86-NEXT:    cmovel %edx, %eax
+; X86-NEXT:    orb %dl, %cl
+; X86-NEXT:    cmovel %edi, %eax
 ; X86-NEXT:    movl $-2147483648, %edx # imm = 0x80000000
 ; X86-NEXT:    cmovel %esi, %edx
 ; X86-NEXT:    addl $4, %esp
@@ -368,65 +369,62 @@
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 32
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 28
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    imull %edi, %esi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    imull %ebx, %edi
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    imull %ecx, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %ebx, %edx
-; X86-NEXT:    imull %ebp, %esi
-; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    imull %ebx, %edi
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    imull %ebp, %ebx
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    imull %ecx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    imull %esi, %edi
+; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    addl %eax, %ebp
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %ebp
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    mull %edx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl %bl, %edi
-; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %eax, %ebp
+; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movzbl %bl, %esi
 ; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    movl %ebp, %edi
 ; X86-NEXT:    sarl $31, %edi
 ; X86-NEXT:    xorl %edi, %edx
 ; X86-NEXT:    xorl %eax, %edi
@@ -437,10 +435,10 @@
 ; X86-NEXT:    orl %edx, %edi
 ; X86-NEXT:    notl %ecx
 ; X86-NEXT:    cmovel (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT:    cmovel %ebx, %esi
+; X86-NEXT:    cmovel %ebp, %esi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 20
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
@@ -651,20 +649,20 @@
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    subl %esi, %ebx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    subl %esi, %ecx
 ; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    sbbl $0, %esi
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    cmovnsl %edi, %esi
-; X86-NEXT:    cmovnsl %edx, %ebx
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmovsl %ecx, %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    sbbl $0, %ecx
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    cmovnsl %esi, %ecx
-; X86-NEXT:    cmovnsl %ebx, %edx
+; X86-NEXT:    cmovsl %edi, %edx
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    setns {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    sets %bh
@@ -674,18 +672,18 @@
 ; X86-NEXT:    andb %bh, %bl
 ; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
 ; X86-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
-; X86-NEXT:    cmovel %edx, %esi
-; X86-NEXT:    movl $-1, %edx
-; X86-NEXT:    cmovnel %edx, %eax
+; X86-NEXT:    cmovnel %esi, %edx
+; X86-NEXT:    movl $-1, %esi
+; X86-NEXT:    cmovnel %esi, %eax
 ; X86-NEXT:    cmpl $-1, %ecx
 ; X86-NEXT:    setl %cl
-; X86-NEXT:    sete %dl
-; X86-NEXT:    andb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    orb %cl, %dl
-; X86-NEXT:    cmovnel %edi, %eax
-; X86-NEXT:    movl $-2147483648, %edx # imm = 0x80000000
-; X86-NEXT:    cmovel %esi, %edx
+; X86-NEXT:    sete %ch
+; X86-NEXT:    andb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    orb %cl, %ch
+; X86-NEXT:    cmovnel %esi, %eax
+; X86-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
+; X86-NEXT:    cmovnel %ecx, %edx
 ; X86-NEXT:    addl $4, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -720,52 +718,51 @@
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %edi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    adcl %ebx, %edx
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    subl %esi, %ebx
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    sbbl $0, %esi
-; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnsl %edi, %esi
-; X86-NEXT:    cmovnsl %edx, %ebx
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    sbbl $0, %ebp
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    cmovnsl %ebx, %ebp
+; X86-NEXT:    cmovnsl %edi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ebp, %ecx
 ; X86-NEXT:    sbbl $0, %ecx
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnsl %esi, %ecx
-; X86-NEXT:    cmovnsl %ebx, %edx
-; X86-NEXT:    shrdl $31, %edx, %eax
-; X86-NEXT:    shrdl $31, %ecx, %edx
+; X86-NEXT:    cmovnsl %ebp, %ecx
+; X86-NEXT:    cmovnsl %edx, %esi
+; X86-NEXT:    shrdl $31, %esi, %eax
+; X86-NEXT:    shrdl $31, %ecx, %esi
 ; X86-NEXT:    cmpl $1073741824, %ecx # imm = 0x40000000
-; X86-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
-; X86-NEXT:    cmovll %edx, %esi
+; X86-NEXT:    movl $2147483647, %edi # imm = 0x7FFFFFFF
+; X86-NEXT:    cmovll %esi, %edi
 ; X86-NEXT:    movl $-1, %edx
 ; X86-NEXT:    cmovgel %edx, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    cmpl $-1073741824, %ecx # imm = 0xC0000000
 ; X86-NEXT:    cmovll %edx, %eax
 ; X86-NEXT:    movl $-2147483648, %edx # imm = 0x80000000
-; X86-NEXT:    cmovgel %esi, %edx
+; X86-NEXT:    cmovgel %edi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index b93ecdf..c0d9287 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -42,10 +42,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,306783378,171798690,42949672]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993458,306783378,171798690,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -405,10 +405,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993458,306783378,4294967295,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -492,10 +492,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993458,306783378,4294967295,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -585,10 +585,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,858993458,268435454,858993458]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993458,858993458,268435454,858993458]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -675,10 +675,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,306783378,268435454,306783378]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [306783378,306783378,268435454,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -764,10 +764,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,306783378,268435454,42949672]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993458,306783378,268435454,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -992,10 +992,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993458,306783378,4294967295,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -1406,10 +1406,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,4294967295,268435454,858993458]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,858993458]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -1495,10 +1495,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,4294967295,268435454,306783378]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435454,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -1584,10 +1584,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,4294967295,268435454,42949672]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -1812,10 +1812,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,4294967295,4294967295,42949672]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993458,4294967295,4294967295,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -1903,10 +1903,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,268435454,4294967295,858993458]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993458,268435454,4294967295,858993458]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -1992,10 +1992,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,268435454,4294967295,306783378]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [306783378,268435454,4294967295,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -2081,10 +2081,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,268435454,4294967295,42949672]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993458,268435454,4294967295,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
index e26b26f..95587ee 100644
--- a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -332,9 +332,8 @@
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSE-NEXT:    addss %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test13:
diff --git a/llvm/test/CodeGen/X86/uadd_sat.ll b/llvm/test/CodeGen/X86/uadd_sat.ll
index 39ad8ab..cbecdef 100644
--- a/llvm/test/CodeGen/X86/uadd_sat.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat.ll
@@ -151,11 +151,12 @@
 ; X64-LABEL: vec:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; X64-NEXT:    paddd %xmm0, %xmm1
-; X64-NEXT:    pxor %xmm2, %xmm0
-; X64-NEXT:    pxor %xmm1, %xmm2
-; X64-NEXT:    pcmpgtd %xmm2, %xmm0
-; X64-NEXT:    por %xmm1, %xmm0
+; X64-NEXT:    movdqa %xmm0, %xmm3
+; X64-NEXT:    pxor %xmm2, %xmm3
+; X64-NEXT:    paddd %xmm1, %xmm0
+; X64-NEXT:    pxor %xmm0, %xmm2
+; X64-NEXT:    pcmpgtd %xmm2, %xmm3
+; X64-NEXT:    por %xmm3, %xmm0
 ; X64-NEXT:    retq
   %tmp = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %tmp
diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
index b49b071..997205b6 100644
--- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
@@ -522,21 +522,23 @@
 ; SSE2-LABEL: v2i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v2i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    paddd %xmm0, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    pxor %xmm1, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSSE3-NEXT:    por %xmm3, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v2i32:
@@ -588,21 +590,23 @@
 ; SSE2-LABEL: v4i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v4i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    paddd %xmm0, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    pxor %xmm1, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSSE3-NEXT:    por %xmm3, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v4i32:
@@ -654,33 +658,37 @@
 ; SSE2-LABEL: v8i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    paddd %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
 ; SSE2-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
-; SSE2-NEXT:    paddd %xmm1, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    pxor %xmm3, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    paddd %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v8i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    paddd %xmm0, %xmm2
-; SSSE3-NEXT:    pxor %xmm4, %xmm0
-; SSSE3-NEXT:    movdqa %xmm2, %xmm5
+; SSSE3-NEXT:    movdqa %xmm0, %xmm5
 ; SSSE3-NEXT:    pxor %xmm4, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT:    por %xmm2, %xmm0
-; SSSE3-NEXT:    paddd %xmm1, %xmm3
-; SSSE3-NEXT:    pxor %xmm4, %xmm1
-; SSSE3-NEXT:    pxor %xmm3, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm1
-; SSSE3-NEXT:    por %xmm3, %xmm1
+; SSSE3-NEXT:    paddd %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm5
+; SSSE3-NEXT:    por %xmm5, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    paddd %xmm3, %xmm1
+; SSSE3-NEXT:    pxor %xmm1, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v8i32:
@@ -741,57 +749,65 @@
 ; SSE2-LABEL: v16i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    paddd %xmm0, %xmm4
-; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    movdqa %xmm4, %xmm9
+; SSE2-NEXT:    movdqa %xmm0, %xmm9
 ; SSE2-NEXT:    pxor %xmm8, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm9, %xmm0
-; SSE2-NEXT:    por %xmm4, %xmm0
-; SSE2-NEXT:    paddd %xmm1, %xmm5
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    paddd %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT:    por %xmm5, %xmm1
-; SSE2-NEXT:    paddd %xmm2, %xmm6
-; SSE2-NEXT:    pxor %xmm8, %xmm2
-; SSE2-NEXT:    movdqa %xmm6, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm9
+; SSE2-NEXT:    por %xmm9, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
 ; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT:    por %xmm6, %xmm2
-; SSE2-NEXT:    paddd %xmm3, %xmm7
-; SSE2-NEXT:    pxor %xmm8, %xmm3
-; SSE2-NEXT:    pxor %xmm7, %xmm8
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm3
-; SSE2-NEXT:    por %xmm7, %xmm3
+; SSE2-NEXT:    paddd %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
+; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    paddd %xmm6, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    paddd %xmm7, %xmm3
+; SSE2-NEXT:    pxor %xmm3, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm4
+; SSE2-NEXT:    por %xmm4, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v16i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    paddd %xmm0, %xmm4
-; SSSE3-NEXT:    pxor %xmm8, %xmm0
-; SSSE3-NEXT:    movdqa %xmm4, %xmm9
+; SSSE3-NEXT:    movdqa %xmm0, %xmm9
 ; SSSE3-NEXT:    pxor %xmm8, %xmm9
-; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm0
-; SSSE3-NEXT:    por %xmm4, %xmm0
-; SSSE3-NEXT:    paddd %xmm1, %xmm5
-; SSSE3-NEXT:    pxor %xmm8, %xmm1
-; SSSE3-NEXT:    movdqa %xmm5, %xmm4
+; SSSE3-NEXT:    paddd %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
 ; SSSE3-NEXT:    pxor %xmm8, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm1
-; SSSE3-NEXT:    por %xmm5, %xmm1
-; SSSE3-NEXT:    paddd %xmm2, %xmm6
-; SSSE3-NEXT:    pxor %xmm8, %xmm2
-; SSSE3-NEXT:    movdqa %xmm6, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm9
+; SSSE3-NEXT:    por %xmm9, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
 ; SSSE3-NEXT:    pxor %xmm8, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm2
-; SSSE3-NEXT:    por %xmm6, %xmm2
-; SSSE3-NEXT:    paddd %xmm3, %xmm7
-; SSSE3-NEXT:    pxor %xmm8, %xmm3
-; SSSE3-NEXT:    pxor %xmm7, %xmm8
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm3
-; SSSE3-NEXT:    por %xmm7, %xmm3
+; SSSE3-NEXT:    paddd %xmm5, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm5
+; SSSE3-NEXT:    pxor %xmm8, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
+; SSSE3-NEXT:    por %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pxor %xmm8, %xmm4
+; SSSE3-NEXT:    paddd %xmm6, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm5
+; SSSE3-NEXT:    pxor %xmm8, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
+; SSSE3-NEXT:    por %xmm4, %xmm2
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pxor %xmm8, %xmm4
+; SSSE3-NEXT:    paddd %xmm7, %xmm3
+; SSSE3-NEXT:    pxor %xmm3, %xmm8
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm4
+; SSSE3-NEXT:    por %xmm4, %xmm3
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v16i32:
@@ -863,16 +879,17 @@
 ; SSE-LABEL: v2i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE-NEXT:    paddq %xmm0, %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    pcmpgtd %xmm2, %xmm3
-; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE-NEXT:    pxor %xmm2, %xmm3
+; SSE-NEXT:    paddq %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm0, %xmm2
+; SSE-NEXT:    movdqa %xmm3, %xmm1
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSE-NEXT:    pcmpeqd %xmm3, %xmm2
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE-NEXT:    pand %xmm4, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    por %xmm2, %xmm0
 ; SSE-NEXT:    retq
@@ -923,31 +940,33 @@
 ; SSE-LABEL: v4i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE-NEXT:    paddq %xmm0, %xmm2
-; SSE-NEXT:    pxor %xmm4, %xmm0
-; SSE-NEXT:    movdqa %xmm2, %xmm5
+; SSE-NEXT:    movdqa %xmm0, %xmm5
 ; SSE-NEXT:    pxor %xmm4, %xmm5
-; SSE-NEXT:    movdqa %xmm0, %xmm6
-; SSE-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSE-NEXT:    paddq %xmm2, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pxor %xmm4, %xmm2
+; SSE-NEXT:    movdqa %xmm5, %xmm6
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm6
 ; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE-NEXT:    pcmpeqd %xmm0, %xmm5
-; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE-NEXT:    pand %xmm7, %xmm5
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT:    pand %xmm7, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
 ; SSE-NEXT:    por %xmm5, %xmm0
-; SSE-NEXT:    paddq %xmm1, %xmm3
-; SSE-NEXT:    pxor %xmm4, %xmm1
-; SSE-NEXT:    pxor %xmm3, %xmm4
+; SSE-NEXT:    por %xmm2, %xmm0
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
-; SSE-NEXT:    pcmpgtd %xmm4, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
-; SSE-NEXT:    pcmpeqd %xmm1, %xmm4
-; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE-NEXT:    pand %xmm5, %xmm4
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE-NEXT:    pxor %xmm4, %xmm2
+; SSE-NEXT:    paddq %xmm3, %xmm1
+; SSE-NEXT:    pxor %xmm1, %xmm4
+; SSE-NEXT:    movdqa %xmm2, %xmm3
+; SSE-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE-NEXT:    pand %xmm5, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
 ; SSE-NEXT:    por %xmm3, %xmm1
-; SSE-NEXT:    por %xmm4, %xmm1
+; SSE-NEXT:    por %xmm2, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: v4i64:
@@ -1003,57 +1022,61 @@
 ; SSE-LABEL: v8i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
-; SSE-NEXT:    paddq %xmm0, %xmm4
-; SSE-NEXT:    pxor %xmm8, %xmm0
-; SSE-NEXT:    movdqa %xmm4, %xmm9
+; SSE-NEXT:    movdqa %xmm0, %xmm9
 ; SSE-NEXT:    pxor %xmm8, %xmm9
-; SSE-NEXT:    movdqa %xmm0, %xmm10
-; SSE-NEXT:    pcmpgtd %xmm9, %xmm10
+; SSE-NEXT:    paddq %xmm4, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm4
+; SSE-NEXT:    pxor %xmm8, %xmm4
+; SSE-NEXT:    movdqa %xmm9, %xmm10
+; SSE-NEXT:    pcmpgtd %xmm4, %xmm10
 ; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE-NEXT:    pcmpeqd %xmm0, %xmm9
-; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
+; SSE-NEXT:    pcmpeqd %xmm9, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
 ; SSE-NEXT:    pand %xmm11, %xmm9
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
 ; SSE-NEXT:    por %xmm4, %xmm0
 ; SSE-NEXT:    por %xmm9, %xmm0
-; SSE-NEXT:    paddq %xmm1, %xmm5
-; SSE-NEXT:    pxor %xmm8, %xmm1
-; SSE-NEXT:    movdqa %xmm5, %xmm4
-; SSE-NEXT:    pxor %xmm8, %xmm4
 ; SSE-NEXT:    movdqa %xmm1, %xmm9
-; SSE-NEXT:    pcmpgtd %xmm4, %xmm9
-; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE-NEXT:    pcmpeqd %xmm1, %xmm4
+; SSE-NEXT:    pxor %xmm8, %xmm9
+; SSE-NEXT:    paddq %xmm5, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm5
+; SSE-NEXT:    pxor %xmm8, %xmm5
+; SSE-NEXT:    movdqa %xmm9, %xmm4
+; SSE-NEXT:    pcmpgtd %xmm5, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSE-NEXT:    pcmpeqd %xmm9, %xmm5
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE-NEXT:    pand %xmm10, %xmm5
 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE-NEXT:    pand %xmm10, %xmm4
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[1,1,3,3]
-; SSE-NEXT:    por %xmm5, %xmm1
 ; SSE-NEXT:    por %xmm4, %xmm1
-; SSE-NEXT:    paddq %xmm2, %xmm6
-; SSE-NEXT:    pxor %xmm8, %xmm2
-; SSE-NEXT:    movdqa %xmm6, %xmm4
+; SSE-NEXT:    por %xmm5, %xmm1
+; SSE-NEXT:    movdqa %xmm2, %xmm4
 ; SSE-NEXT:    pxor %xmm8, %xmm4
+; SSE-NEXT:    paddq %xmm6, %xmm2
 ; SSE-NEXT:    movdqa %xmm2, %xmm5
-; SSE-NEXT:    pcmpgtd %xmm4, %xmm5
-; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
-; SSE-NEXT:    pcmpeqd %xmm2, %xmm4
-; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT:    pxor %xmm8, %xmm5
+; SSE-NEXT:    movdqa %xmm4, %xmm6
+; SSE-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
+; SSE-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
 ; SSE-NEXT:    pand %xmm9, %xmm4
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSE-NEXT:    por %xmm6, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE-NEXT:    por %xmm5, %xmm2
 ; SSE-NEXT:    por %xmm4, %xmm2
-; SSE-NEXT:    paddq %xmm3, %xmm7
-; SSE-NEXT:    pxor %xmm8, %xmm3
-; SSE-NEXT:    pxor %xmm7, %xmm8
 ; SSE-NEXT:    movdqa %xmm3, %xmm4
-; SSE-NEXT:    pcmpgtd %xmm8, %xmm4
-; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE-NEXT:    pcmpeqd %xmm3, %xmm8
-; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3]
-; SSE-NEXT:    pand %xmm5, %xmm6
-; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE-NEXT:    por %xmm7, %xmm3
-; SSE-NEXT:    por %xmm6, %xmm3
+; SSE-NEXT:    pxor %xmm8, %xmm4
+; SSE-NEXT:    paddq %xmm7, %xmm3
+; SSE-NEXT:    pxor %xmm3, %xmm8
+; SSE-NEXT:    movdqa %xmm4, %xmm5
+; SSE-NEXT:    pcmpgtd %xmm8, %xmm5
+; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE-NEXT:    pcmpeqd %xmm4, %xmm8
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3]
+; SSE-NEXT:    pand %xmm6, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE-NEXT:    por %xmm5, %xmm3
+; SSE-NEXT:    por %xmm4, %xmm3
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: v8i64:
diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll
index 3afddbb..bc93d02 100644
--- a/llvm/test/CodeGen/X86/umul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll
@@ -34,9 +34,10 @@
 ;
 ; X64-LABEL: test2:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    addl %esi, %edi
-; X64-NEXT:    leal (%rdi,%rdi), %eax
+; X64-NEXT:    leal (%rdi,%rsi), %eax
+; X64-NEXT:    addl %eax, %eax
 ; X64-NEXT:    retq
 entry:
 	%tmp0 = add i32 %b, %a
diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll
index fa24779..fce98cc 100644
--- a/llvm/test/CodeGen/X86/umul_fix.ll
+++ b/llvm/test/CodeGen/X86/umul_fix.ll
@@ -45,19 +45,19 @@
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    addl %edx, %ebx
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    adcl %edi, %edx
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl %ecx, %edx
@@ -306,31 +306,31 @@
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    shldl $1, %edx, %ecx
-; X86-NEXT:    shrdl $31, %edx, %eax
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    adcl %edx, %ecx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    shrdl $31, %ecx, %eax
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll
index a61090a..155dab5 100644
--- a/llvm/test/CodeGen/X86/umul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll
@@ -52,27 +52,26 @@
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    addl %edx, %ebx
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    adcl %edi, %edx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    addl %ebp, %edx
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    shrdl $2, %eax, %ecx
 ; X86-NEXT:    shrdl $2, %edx, %eax
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index cb7a93a..a9be2a5 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -35,10 +35,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,306783378,171798691,42949672]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,306783378,171798691,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -189,10 +189,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,306783378,1,306783378]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [306783378,306783378,1,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -267,10 +267,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783379,306783379,2,306783379]
-; CHECK-SSE41-NEXT:    pmaxud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [306783379,306783379,2,306783379]
+; CHECK-SSE41-NEXT:    pmaxud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -346,10 +346,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,306783378,1,42949672]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,306783378,1,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -423,10 +423,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993460,306783379,2,42949673]
-; CHECK-SSE41-NEXT:    pmaxud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993460,306783379,2,42949673]
+; CHECK-SSE41-NEXT:    pmaxud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -505,10 +505,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,268435455,858993459]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,858993459,268435455,858993459]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -585,10 +585,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,306783378,268435455,306783378]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [306783378,306783378,268435455,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -664,10 +664,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,306783378,268435455,42949672]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,306783378,268435455,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -870,10 +870,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,306783378,4294967295,42949672]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,306783378,4294967295,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -952,10 +952,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,1,858993459]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,858993459,1,858993459]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -1032,10 +1032,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,306783378,1,306783378]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [306783378,306783378,1,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -1111,10 +1111,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,306783378,1,42949672]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,306783378,1,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -1193,10 +1193,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,1,268435455,858993459]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,1,268435455,858993459]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -1272,10 +1272,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,1,268435455,306783378]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [306783378,1,268435455,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -1351,10 +1351,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,1,268435455,42949672]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,1,268435455,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -1469,10 +1469,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,1,4294967295,306783378]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [306783378,1,4294967295,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -1548,10 +1548,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,1,4294967295,42949672]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,1,4294967295,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -1629,10 +1629,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,268435455,4294967295,858993459]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,268435455,4294967295,858993459]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -1708,10 +1708,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,268435455,4294967295,306783378]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [306783378,268435455,4294967295,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -1787,10 +1787,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,268435455,4294967295,42949672]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,268435455,4294967295,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -1868,10 +1868,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,1,268435455,4294967295]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,1,268435455,4294967295]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
@@ -1947,10 +1947,10 @@
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,1,268435455,4294967295]
-; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [306783378,1,268435455,4294967295]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll
index 3b477766..008435b 100644
--- a/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll
@@ -723,11 +723,11 @@
 ; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movaps %xmm2, %xmm4
 ; SSE-32-NEXT:    cmpneqps %xmm3, %xmm4
-; SSE-32-NEXT:    cmpordps %xmm3, %xmm2
-; SSE-32-NEXT:    andps %xmm4, %xmm2
-; SSE-32-NEXT:    andps %xmm2, %xmm0
-; SSE-32-NEXT:    andnps %xmm1, %xmm2
-; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    cmpordps %xmm2, %xmm3
+; SSE-32-NEXT:    andps %xmm4, %xmm3
+; SSE-32-NEXT:    andps %xmm3, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm3
+; SSE-32-NEXT:    orps %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -737,10 +737,10 @@
 ; SSE-64-NEXT:    movaps %xmm2, %xmm4
 ; SSE-64-NEXT:    cmpneqps %xmm3, %xmm4
 ; SSE-64-NEXT:    cmpordps %xmm3, %xmm2
-; SSE-64-NEXT:    andps %xmm4, %xmm2
-; SSE-64-NEXT:    andps %xmm2, %xmm0
-; SSE-64-NEXT:    andnps %xmm1, %xmm2
-; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    andps %xmm2, %xmm4
+; SSE-64-NEXT:    andps %xmm4, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm4
+; SSE-64-NEXT:    orps %xmm4, %xmm0
 ; SSE-64-NEXT:    retq
 ;
 ; AVX-32-LABEL: test_v4f32_one_q:
@@ -916,11 +916,11 @@
 ; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movaps %xmm2, %xmm4
 ; SSE-32-NEXT:    cmpeqps %xmm3, %xmm4
-; SSE-32-NEXT:    cmpunordps %xmm3, %xmm2
-; SSE-32-NEXT:    orps %xmm4, %xmm2
-; SSE-32-NEXT:    andps %xmm2, %xmm0
-; SSE-32-NEXT:    andnps %xmm1, %xmm2
-; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    cmpunordps %xmm2, %xmm3
+; SSE-32-NEXT:    orps %xmm4, %xmm3
+; SSE-32-NEXT:    andps %xmm3, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm3
+; SSE-32-NEXT:    orps %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -930,10 +930,10 @@
 ; SSE-64-NEXT:    movaps %xmm2, %xmm4
 ; SSE-64-NEXT:    cmpeqps %xmm3, %xmm4
 ; SSE-64-NEXT:    cmpunordps %xmm3, %xmm2
-; SSE-64-NEXT:    orps %xmm4, %xmm2
-; SSE-64-NEXT:    andps %xmm2, %xmm0
-; SSE-64-NEXT:    andnps %xmm1, %xmm2
-; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    orps %xmm2, %xmm4
+; SSE-64-NEXT:    andps %xmm4, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm4
+; SSE-64-NEXT:    orps %xmm4, %xmm0
 ; SSE-64-NEXT:    retq
 ;
 ; AVX-32-LABEL: test_v4f32_ueq_q:
@@ -2383,11 +2383,11 @@
 ; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movapd %xmm2, %xmm4
 ; SSE-32-NEXT:    cmpneqpd %xmm3, %xmm4
-; SSE-32-NEXT:    cmpordpd %xmm3, %xmm2
-; SSE-32-NEXT:    andpd %xmm4, %xmm2
-; SSE-32-NEXT:    andpd %xmm2, %xmm0
-; SSE-32-NEXT:    andnpd %xmm1, %xmm2
-; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    cmpordpd %xmm2, %xmm3
+; SSE-32-NEXT:    andpd %xmm4, %xmm3
+; SSE-32-NEXT:    andpd %xmm3, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm3
+; SSE-32-NEXT:    orpd %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -2397,10 +2397,10 @@
 ; SSE-64-NEXT:    movapd %xmm2, %xmm4
 ; SSE-64-NEXT:    cmpneqpd %xmm3, %xmm4
 ; SSE-64-NEXT:    cmpordpd %xmm3, %xmm2
-; SSE-64-NEXT:    andpd %xmm4, %xmm2
-; SSE-64-NEXT:    andpd %xmm2, %xmm0
-; SSE-64-NEXT:    andnpd %xmm1, %xmm2
-; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    andpd %xmm2, %xmm4
+; SSE-64-NEXT:    andpd %xmm4, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm4
+; SSE-64-NEXT:    orpd %xmm4, %xmm0
 ; SSE-64-NEXT:    retq
 ;
 ; AVX-32-LABEL: test_v2f64_one_q:
@@ -2576,11 +2576,11 @@
 ; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movapd %xmm2, %xmm4
 ; SSE-32-NEXT:    cmpeqpd %xmm3, %xmm4
-; SSE-32-NEXT:    cmpunordpd %xmm3, %xmm2
-; SSE-32-NEXT:    orpd %xmm4, %xmm2
-; SSE-32-NEXT:    andpd %xmm2, %xmm0
-; SSE-32-NEXT:    andnpd %xmm1, %xmm2
-; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    cmpunordpd %xmm2, %xmm3
+; SSE-32-NEXT:    orpd %xmm4, %xmm3
+; SSE-32-NEXT:    andpd %xmm3, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm3
+; SSE-32-NEXT:    orpd %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -2590,10 +2590,10 @@
 ; SSE-64-NEXT:    movapd %xmm2, %xmm4
 ; SSE-64-NEXT:    cmpeqpd %xmm3, %xmm4
 ; SSE-64-NEXT:    cmpunordpd %xmm3, %xmm2
-; SSE-64-NEXT:    orpd %xmm4, %xmm2
-; SSE-64-NEXT:    andpd %xmm2, %xmm0
-; SSE-64-NEXT:    andnpd %xmm1, %xmm2
-; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    orpd %xmm2, %xmm4
+; SSE-64-NEXT:    andpd %xmm4, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm4
+; SSE-64-NEXT:    orpd %xmm4, %xmm0
 ; SSE-64-NEXT:    retq
 ;
 ; AVX-32-LABEL: test_v2f64_ueq_q:
@@ -3338,10 +3338,10 @@
 ; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movaps %xmm2, %xmm4
 ; SSE-32-NEXT:    cmpltps %xmm3, %xmm4
-; SSE-32-NEXT:    cmpeqps %xmm3, %xmm2
-; SSE-32-NEXT:    andps %xmm2, %xmm0
-; SSE-32-NEXT:    andnps %xmm1, %xmm2
-; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    cmpeqps %xmm2, %xmm3
+; SSE-32-NEXT:    andps %xmm3, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm3
+; SSE-32-NEXT:    orps %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -3816,11 +3816,11 @@
 ; SSE-32-NEXT:    cmpltps %xmm3, %xmm4
 ; SSE-32-NEXT:    movaps %xmm2, %xmm4
 ; SSE-32-NEXT:    cmpneqps %xmm3, %xmm4
-; SSE-32-NEXT:    cmpordps %xmm3, %xmm2
-; SSE-32-NEXT:    andps %xmm4, %xmm2
-; SSE-32-NEXT:    andps %xmm2, %xmm0
-; SSE-32-NEXT:    andnps %xmm1, %xmm2
-; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    cmpordps %xmm2, %xmm3
+; SSE-32-NEXT:    andps %xmm4, %xmm3
+; SSE-32-NEXT:    andps %xmm3, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm3
+; SSE-32-NEXT:    orps %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -3832,10 +3832,10 @@
 ; SSE-64-NEXT:    movaps %xmm2, %xmm4
 ; SSE-64-NEXT:    cmpneqps %xmm3, %xmm4
 ; SSE-64-NEXT:    cmpordps %xmm3, %xmm2
-; SSE-64-NEXT:    andps %xmm4, %xmm2
-; SSE-64-NEXT:    andps %xmm2, %xmm0
-; SSE-64-NEXT:    andnps %xmm1, %xmm2
-; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    andps %xmm2, %xmm4
+; SSE-64-NEXT:    andps %xmm4, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm4
+; SSE-64-NEXT:    orps %xmm4, %xmm0
 ; SSE-64-NEXT:    retq
 ;
 ; AVX-32-LABEL: test_v4f32_one_s:
@@ -3918,10 +3918,10 @@
 ; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movaps %xmm2, %xmm4
 ; SSE-32-NEXT:    cmpltps %xmm3, %xmm4
-; SSE-32-NEXT:    cmpordps %xmm3, %xmm2
-; SSE-32-NEXT:    andps %xmm2, %xmm0
-; SSE-32-NEXT:    andnps %xmm1, %xmm2
-; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    cmpordps %xmm2, %xmm3
+; SSE-32-NEXT:    andps %xmm3, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm3
+; SSE-32-NEXT:    orps %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -4018,11 +4018,11 @@
 ; SSE-32-NEXT:    cmpltps %xmm3, %xmm4
 ; SSE-32-NEXT:    movaps %xmm2, %xmm4
 ; SSE-32-NEXT:    cmpeqps %xmm3, %xmm4
-; SSE-32-NEXT:    cmpunordps %xmm3, %xmm2
-; SSE-32-NEXT:    orps %xmm4, %xmm2
-; SSE-32-NEXT:    andps %xmm2, %xmm0
-; SSE-32-NEXT:    andnps %xmm1, %xmm2
-; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    cmpunordps %xmm2, %xmm3
+; SSE-32-NEXT:    orps %xmm4, %xmm3
+; SSE-32-NEXT:    andps %xmm3, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm3
+; SSE-32-NEXT:    orps %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -4034,10 +4034,10 @@
 ; SSE-64-NEXT:    movaps %xmm2, %xmm4
 ; SSE-64-NEXT:    cmpeqps %xmm3, %xmm4
 ; SSE-64-NEXT:    cmpunordps %xmm3, %xmm2
-; SSE-64-NEXT:    orps %xmm4, %xmm2
-; SSE-64-NEXT:    andps %xmm2, %xmm0
-; SSE-64-NEXT:    andnps %xmm1, %xmm2
-; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    orps %xmm2, %xmm4
+; SSE-64-NEXT:    andps %xmm4, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm4
+; SSE-64-NEXT:    orps %xmm4, %xmm0
 ; SSE-64-NEXT:    retq
 ;
 ; AVX-32-LABEL: test_v4f32_ueq_s:
@@ -4498,10 +4498,10 @@
 ; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movaps %xmm2, %xmm4
 ; SSE-32-NEXT:    cmpltps %xmm3, %xmm4
-; SSE-32-NEXT:    cmpneqps %xmm3, %xmm2
-; SSE-32-NEXT:    andps %xmm2, %xmm0
-; SSE-32-NEXT:    andnps %xmm1, %xmm2
-; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    cmpneqps %xmm2, %xmm3
+; SSE-32-NEXT:    andps %xmm3, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm3
+; SSE-32-NEXT:    orps %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -4596,10 +4596,10 @@
 ; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movaps %xmm2, %xmm4
 ; SSE-32-NEXT:    cmpltps %xmm3, %xmm4
-; SSE-32-NEXT:    cmpunordps %xmm3, %xmm2
-; SSE-32-NEXT:    andps %xmm2, %xmm0
-; SSE-32-NEXT:    andnps %xmm1, %xmm2
-; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    cmpunordps %xmm2, %xmm3
+; SSE-32-NEXT:    andps %xmm3, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm3
+; SSE-32-NEXT:    orps %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -4694,10 +4694,10 @@
 ; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movapd %xmm2, %xmm4
 ; SSE-32-NEXT:    cmpltpd %xmm3, %xmm4
-; SSE-32-NEXT:    cmpeqpd %xmm3, %xmm2
-; SSE-32-NEXT:    andpd %xmm2, %xmm0
-; SSE-32-NEXT:    andnpd %xmm1, %xmm2
-; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    cmpeqpd %xmm2, %xmm3
+; SSE-32-NEXT:    andpd %xmm3, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm3
+; SSE-32-NEXT:    orpd %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -5172,11 +5172,11 @@
 ; SSE-32-NEXT:    cmpltpd %xmm3, %xmm4
 ; SSE-32-NEXT:    movapd %xmm2, %xmm4
 ; SSE-32-NEXT:    cmpneqpd %xmm3, %xmm4
-; SSE-32-NEXT:    cmpordpd %xmm3, %xmm2
-; SSE-32-NEXT:    andpd %xmm4, %xmm2
-; SSE-32-NEXT:    andpd %xmm2, %xmm0
-; SSE-32-NEXT:    andnpd %xmm1, %xmm2
-; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    cmpordpd %xmm2, %xmm3
+; SSE-32-NEXT:    andpd %xmm4, %xmm3
+; SSE-32-NEXT:    andpd %xmm3, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm3
+; SSE-32-NEXT:    orpd %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -5188,10 +5188,10 @@
 ; SSE-64-NEXT:    movapd %xmm2, %xmm4
 ; SSE-64-NEXT:    cmpneqpd %xmm3, %xmm4
 ; SSE-64-NEXT:    cmpordpd %xmm3, %xmm2
-; SSE-64-NEXT:    andpd %xmm4, %xmm2
-; SSE-64-NEXT:    andpd %xmm2, %xmm0
-; SSE-64-NEXT:    andnpd %xmm1, %xmm2
-; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    andpd %xmm2, %xmm4
+; SSE-64-NEXT:    andpd %xmm4, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm4
+; SSE-64-NEXT:    orpd %xmm4, %xmm0
 ; SSE-64-NEXT:    retq
 ;
 ; AVX-32-LABEL: test_v2f64_one_s:
@@ -5274,10 +5274,10 @@
 ; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movapd %xmm2, %xmm4
 ; SSE-32-NEXT:    cmpltpd %xmm3, %xmm4
-; SSE-32-NEXT:    cmpordpd %xmm3, %xmm2
-; SSE-32-NEXT:    andpd %xmm2, %xmm0
-; SSE-32-NEXT:    andnpd %xmm1, %xmm2
-; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    cmpordpd %xmm2, %xmm3
+; SSE-32-NEXT:    andpd %xmm3, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm3
+; SSE-32-NEXT:    orpd %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -5374,11 +5374,11 @@
 ; SSE-32-NEXT:    cmpltpd %xmm3, %xmm4
 ; SSE-32-NEXT:    movapd %xmm2, %xmm4
 ; SSE-32-NEXT:    cmpeqpd %xmm3, %xmm4
-; SSE-32-NEXT:    cmpunordpd %xmm3, %xmm2
-; SSE-32-NEXT:    orpd %xmm4, %xmm2
-; SSE-32-NEXT:    andpd %xmm2, %xmm0
-; SSE-32-NEXT:    andnpd %xmm1, %xmm2
-; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    cmpunordpd %xmm2, %xmm3
+; SSE-32-NEXT:    orpd %xmm4, %xmm3
+; SSE-32-NEXT:    andpd %xmm3, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm3
+; SSE-32-NEXT:    orpd %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -5390,10 +5390,10 @@
 ; SSE-64-NEXT:    movapd %xmm2, %xmm4
 ; SSE-64-NEXT:    cmpeqpd %xmm3, %xmm4
 ; SSE-64-NEXT:    cmpunordpd %xmm3, %xmm2
-; SSE-64-NEXT:    orpd %xmm4, %xmm2
-; SSE-64-NEXT:    andpd %xmm2, %xmm0
-; SSE-64-NEXT:    andnpd %xmm1, %xmm2
-; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    orpd %xmm2, %xmm4
+; SSE-64-NEXT:    andpd %xmm4, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm4
+; SSE-64-NEXT:    orpd %xmm4, %xmm0
 ; SSE-64-NEXT:    retq
 ;
 ; AVX-32-LABEL: test_v2f64_ueq_s:
@@ -5854,10 +5854,10 @@
 ; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movapd %xmm2, %xmm4
 ; SSE-32-NEXT:    cmpltpd %xmm3, %xmm4
-; SSE-32-NEXT:    cmpneqpd %xmm3, %xmm2
-; SSE-32-NEXT:    andpd %xmm2, %xmm0
-; SSE-32-NEXT:    andnpd %xmm1, %xmm2
-; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    cmpneqpd %xmm2, %xmm3
+; SSE-32-NEXT:    andpd %xmm3, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm3
+; SSE-32-NEXT:    orpd %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
@@ -5952,10 +5952,10 @@
 ; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movapd %xmm2, %xmm4
 ; SSE-32-NEXT:    cmpltpd %xmm3, %xmm4
-; SSE-32-NEXT:    cmpunordpd %xmm3, %xmm2
-; SSE-32-NEXT:    andpd %xmm2, %xmm0
-; SSE-32-NEXT:    andnpd %xmm1, %xmm2
-; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    cmpunordpd %xmm2, %xmm3
+; SSE-32-NEXT:    andpd %xmm3, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm3
+; SSE-32-NEXT:    orpd %xmm3, %xmm0
 ; SSE-32-NEXT:    movl %ebp, %esp
 ; SSE-32-NEXT:    popl %ebp
 ; SSE-32-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/vec_ctbits.ll b/llvm/test/CodeGen/X86/vec_ctbits.ll
index e90611f..370f88d 100644
--- a/llvm/test/CodeGen/X86/vec_ctbits.ll
+++ b/llvm/test/CodeGen/X86/vec_ctbits.ll
@@ -37,40 +37,40 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-NEXT:    psrlq $1, %xmm1
-; CHECK-NEXT:    por %xmm0, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-NEXT:    psrlq $2, %xmm0
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlq $2, %xmm1
 ; CHECK-NEXT:    por %xmm1, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-NEXT:    psrlq $4, %xmm1
-; CHECK-NEXT:    por %xmm0, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-NEXT:    psrlq $8, %xmm0
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlq $8, %xmm1
 ; CHECK-NEXT:    por %xmm1, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-NEXT:    psrlq $16, %xmm1
-; CHECK-NEXT:    por %xmm0, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-NEXT:    psrlq $32, %xmm0
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlq $32, %xmm1
 ; CHECK-NEXT:    por %xmm1, %xmm0
 ; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
-; CHECK-NEXT:    pxor %xmm0, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-NEXT:    psrlw $1, %xmm0
+; CHECK-NEXT:    pxor %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $1, %xmm1
+; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-NEXT:    psubb %xmm1, %xmm0
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; CHECK-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NEXT:    pand %xmm1, %xmm2
+; CHECK-NEXT:    psrlw $2, %xmm0
+; CHECK-NEXT:    pand %xmm1, %xmm0
+; CHECK-NEXT:    paddb %xmm2, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $4, %xmm1
+; CHECK-NEXT:    paddb %xmm1, %xmm0
 ; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    psubb %xmm0, %xmm1
-; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; CHECK-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    psrlw $2, %xmm1
-; CHECK-NEXT:    pand %xmm0, %xmm1
-; CHECK-NEXT:    paddb %xmm2, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-NEXT:    psrlw $4, %xmm2
-; CHECK-NEXT:    paddb %xmm1, %xmm2
-; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; CHECK-NEXT:    pxor %xmm0, %xmm0
-; CHECK-NEXT:    psadbw %xmm2, %xmm0
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    psadbw %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %c = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true)
   ret <2 x i64> %c
@@ -123,16 +123,15 @@
 ; CHECK-NEXT:    paddb %xmm2, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-NEXT:    psrlw $4, %xmm1
-; CHECK-NEXT:    paddb %xmm0, %xmm1
-; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT:    pxor %xmm0, %xmm0
-; CHECK-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; CHECK-NEXT:    psadbw %xmm0, %xmm2
-; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT:    psadbw %xmm0, %xmm1
-; CHECK-NEXT:    packuswb %xmm2, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    paddb %xmm1, %xmm0
+; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-NEXT:    psadbw %xmm1, %xmm2
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    psadbw %xmm1, %xmm0
+; CHECK-NEXT:    packuswb %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %c = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false)
   ret <2 x i32> %c
@@ -143,34 +142,34 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-NEXT:    psrld $1, %xmm1
-; CHECK-NEXT:    por %xmm0, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-NEXT:    psrld $2, %xmm0
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrld $2, %xmm1
 ; CHECK-NEXT:    por %xmm1, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-NEXT:    psrld $4, %xmm1
-; CHECK-NEXT:    por %xmm0, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-NEXT:    psrld $8, %xmm0
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrld $8, %xmm1
 ; CHECK-NEXT:    por %xmm1, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-NEXT:    psrld $16, %xmm1
-; CHECK-NEXT:    por %xmm0, %xmm1
-; CHECK-NEXT:    pcmpeqd %xmm2, %xmm2
-; CHECK-NEXT:    pxor %xmm1, %xmm2
-; CHECK-NEXT:    movdqa %xmm2, %xmm0
-; CHECK-NEXT:    psrlw $1, %xmm0
-; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    psubb %xmm0, %xmm2
-; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; CHECK-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-NEXT:    pand %xmm0, %xmm1
-; CHECK-NEXT:    psrlw $2, %xmm2
-; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    paddb %xmm1, %xmm2
-; CHECK-NEXT:    movdqa %xmm2, %xmm0
-; CHECK-NEXT:    psrlw $4, %xmm0
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT:    pxor %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $1, %xmm1
+; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-NEXT:    psubb %xmm1, %xmm0
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; CHECK-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NEXT:    pand %xmm1, %xmm2
+; CHECK-NEXT:    psrlw $2, %xmm0
+; CHECK-NEXT:    pand %xmm1, %xmm0
 ; CHECK-NEXT:    paddb %xmm2, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $4, %xmm1
+; CHECK-NEXT:    paddb %xmm1, %xmm0
 ; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    pxor %xmm1, %xmm1
 ; CHECK-NEXT:    movdqa %xmm0, %xmm2
@@ -200,16 +199,15 @@
 ; CHECK-NEXT:    paddb %xmm2, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-NEXT:    psrlw $4, %xmm1
-; CHECK-NEXT:    paddb %xmm0, %xmm1
-; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT:    pxor %xmm0, %xmm0
-; CHECK-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; CHECK-NEXT:    psadbw %xmm0, %xmm2
-; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT:    psadbw %xmm0, %xmm1
-; CHECK-NEXT:    packuswb %xmm2, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    paddb %xmm1, %xmm0
+; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-NEXT:    psadbw %xmm1, %xmm2
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    psadbw %xmm1, %xmm0
+; CHECK-NEXT:    packuswb %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %c = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
   ret <2 x i32> %c
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index abb457c..51de689 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -1094,9 +1094,9 @@
 ; SSE2-NEXT:    psrlw $8, %xmm5
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    packuswb %xmm5, %xmm0
-; SSE2-NEXT:    pcmpeqb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; SSE2-NEXT:    psrad $24, %xmm0
@@ -1135,9 +1135,9 @@
 ; SSSE3-NEXT:    psrlw $8, %xmm5
 ; SSSE3-NEXT:    psrlw $8, %xmm0
 ; SSSE3-NEXT:    packuswb %xmm5, %xmm0
-; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm0
+; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
 ; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSSE3-NEXT:    pxor %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm3
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; SSSE3-NEXT:    psrad $24, %xmm0
@@ -2385,11 +2385,11 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    pmullw %xmm1, %xmm2
-; SSE2-NEXT:    pmulhuw %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    pcmpeqw %xmm0, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm3, %xmm1
+; SSE2-NEXT:    pmulhuw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pcmpeqw %xmm0, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm0, %xmm1
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE2-NEXT:    psrad $16, %xmm0
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
@@ -2402,11 +2402,11 @@
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
 ; SSSE3-NEXT:    pmullw %xmm1, %xmm2
-; SSSE3-NEXT:    pmulhuw %xmm1, %xmm0
-; SSSE3-NEXT:    pxor %xmm3, %xmm3
-; SSSE3-NEXT:    pcmpeqw %xmm0, %xmm3
-; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT:    pxor %xmm3, %xmm1
+; SSSE3-NEXT:    pmulhuw %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm0, %xmm0
+; SSSE3-NEXT:    pcmpeqw %xmm0, %xmm1
+; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSSE3-NEXT:    pxor %xmm0, %xmm1
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSSE3-NEXT:    psrad $16, %xmm0
 ; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
@@ -2419,11 +2419,11 @@
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    pmullw %xmm1, %xmm2
-; SSE41-NEXT:    pmulhuw %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm3, %xmm3
-; SSE41-NEXT:    pcmpeqw %xmm0, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm3, %xmm1
+; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pcmpeqw %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE41-NEXT:    pxor %xmm0, %xmm1
 ; SSE41-NEXT:    pmovsxwd %xmm1, %xmm0
 ; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
 ; SSE41-NEXT:    pslld $31, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index a9b872c..dbf5db2 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -25,13 +25,13 @@
 ; SSE-NEXT:    shlb $2, %al
 ; SSE-NEXT:    shrb $2, %dil
 ; SSE-NEXT:    andb $51, %dil
-; SSE-NEXT:    orb %al, %dil
-; SSE-NEXT:    movl %edi, %eax
-; SSE-NEXT:    andb $85, %al
-; SSE-NEXT:    addb %al, %al
-; SSE-NEXT:    shrb %dil
-; SSE-NEXT:    andb $85, %dil
 ; SSE-NEXT:    orb %dil, %al
+; SSE-NEXT:    movl %eax, %ecx
+; SSE-NEXT:    andb $85, %cl
+; SSE-NEXT:    addb %cl, %cl
+; SSE-NEXT:    shrb %al
+; SSE-NEXT:    andb $85, %al
+; SSE-NEXT:    orb %cl, %al
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_bitreverse_i8:
@@ -42,13 +42,13 @@
 ; AVX-NEXT:    shlb $2, %al
 ; AVX-NEXT:    shrb $2, %dil
 ; AVX-NEXT:    andb $51, %dil
-; AVX-NEXT:    orb %al, %dil
-; AVX-NEXT:    movl %edi, %eax
-; AVX-NEXT:    andb $85, %al
-; AVX-NEXT:    addb %al, %al
-; AVX-NEXT:    shrb %dil
-; AVX-NEXT:    andb $85, %dil
 ; AVX-NEXT:    orb %dil, %al
+; AVX-NEXT:    movl %eax, %ecx
+; AVX-NEXT:    andb $85, %cl
+; AVX-NEXT:    addb %cl, %cl
+; AVX-NEXT:    shrb %al
+; AVX-NEXT:    andb $85, %al
+; AVX-NEXT:    orb %cl, %al
 ; AVX-NEXT:    retq
 ;
 ; XOP-LABEL: test_bitreverse_i8:
@@ -67,13 +67,13 @@
 ; GFNISSE-NEXT:    shlb $2, %al
 ; GFNISSE-NEXT:    shrb $2, %dil
 ; GFNISSE-NEXT:    andb $51, %dil
-; GFNISSE-NEXT:    orb %al, %dil
-; GFNISSE-NEXT:    movl %edi, %eax
-; GFNISSE-NEXT:    andb $85, %al
-; GFNISSE-NEXT:    addb %al, %al
-; GFNISSE-NEXT:    shrb %dil
-; GFNISSE-NEXT:    andb $85, %dil
 ; GFNISSE-NEXT:    orb %dil, %al
+; GFNISSE-NEXT:    movl %eax, %ecx
+; GFNISSE-NEXT:    andb $85, %cl
+; GFNISSE-NEXT:    addb %cl, %cl
+; GFNISSE-NEXT:    shrb %al
+; GFNISSE-NEXT:    andb $85, %al
+; GFNISSE-NEXT:    orb %cl, %al
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX-LABEL: test_bitreverse_i8:
@@ -84,13 +84,13 @@
 ; GFNIAVX-NEXT:    shlb $2, %al
 ; GFNIAVX-NEXT:    shrb $2, %dil
 ; GFNIAVX-NEXT:    andb $51, %dil
-; GFNIAVX-NEXT:    orb %al, %dil
-; GFNIAVX-NEXT:    movl %edi, %eax
-; GFNIAVX-NEXT:    andb $85, %al
-; GFNIAVX-NEXT:    addb %al, %al
-; GFNIAVX-NEXT:    shrb %dil
-; GFNIAVX-NEXT:    andb $85, %dil
 ; GFNIAVX-NEXT:    orb %dil, %al
+; GFNIAVX-NEXT:    movl %eax, %ecx
+; GFNIAVX-NEXT:    andb $85, %cl
+; GFNIAVX-NEXT:    addb %cl, %cl
+; GFNIAVX-NEXT:    shrb %al
+; GFNIAVX-NEXT:    andb $85, %al
+; GFNIAVX-NEXT:    orb %cl, %al
 ; GFNIAVX-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: test_bitreverse_i8:
@@ -101,13 +101,13 @@
 ; GFNIAVX2-NEXT:    shlb $2, %al
 ; GFNIAVX2-NEXT:    shrb $2, %dil
 ; GFNIAVX2-NEXT:    andb $51, %dil
-; GFNIAVX2-NEXT:    orb %al, %dil
-; GFNIAVX2-NEXT:    movl %edi, %eax
-; GFNIAVX2-NEXT:    andb $85, %al
-; GFNIAVX2-NEXT:    addb %al, %al
-; GFNIAVX2-NEXT:    shrb %dil
-; GFNIAVX2-NEXT:    andb $85, %dil
 ; GFNIAVX2-NEXT:    orb %dil, %al
+; GFNIAVX2-NEXT:    movl %eax, %ecx
+; GFNIAVX2-NEXT:    andb $85, %cl
+; GFNIAVX2-NEXT:    addb %cl, %cl
+; GFNIAVX2-NEXT:    shrb %al
+; GFNIAVX2-NEXT:    andb $85, %al
+; GFNIAVX2-NEXT:    orb %cl, %al
 ; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512F-LABEL: test_bitreverse_i8:
@@ -118,13 +118,13 @@
 ; GFNIAVX512F-NEXT:    shlb $2, %al
 ; GFNIAVX512F-NEXT:    shrb $2, %dil
 ; GFNIAVX512F-NEXT:    andb $51, %dil
-; GFNIAVX512F-NEXT:    orb %al, %dil
-; GFNIAVX512F-NEXT:    movl %edi, %eax
-; GFNIAVX512F-NEXT:    andb $85, %al
-; GFNIAVX512F-NEXT:    addb %al, %al
-; GFNIAVX512F-NEXT:    shrb %dil
-; GFNIAVX512F-NEXT:    andb $85, %dil
 ; GFNIAVX512F-NEXT:    orb %dil, %al
+; GFNIAVX512F-NEXT:    movl %eax, %ecx
+; GFNIAVX512F-NEXT:    andb $85, %cl
+; GFNIAVX512F-NEXT:    addb %cl, %cl
+; GFNIAVX512F-NEXT:    shrb %al
+; GFNIAVX512F-NEXT:    andb $85, %al
+; GFNIAVX512F-NEXT:    orb %cl, %al
 ; GFNIAVX512F-NEXT:    retq
 ;
 ; GFNIAVX512BW-LABEL: test_bitreverse_i8:
@@ -135,13 +135,13 @@
 ; GFNIAVX512BW-NEXT:    shlb $2, %al
 ; GFNIAVX512BW-NEXT:    shrb $2, %dil
 ; GFNIAVX512BW-NEXT:    andb $51, %dil
-; GFNIAVX512BW-NEXT:    orb %al, %dil
-; GFNIAVX512BW-NEXT:    movl %edi, %eax
-; GFNIAVX512BW-NEXT:    andb $85, %al
-; GFNIAVX512BW-NEXT:    addb %al, %al
-; GFNIAVX512BW-NEXT:    shrb %dil
-; GFNIAVX512BW-NEXT:    andb $85, %dil
 ; GFNIAVX512BW-NEXT:    orb %dil, %al
+; GFNIAVX512BW-NEXT:    movl %eax, %ecx
+; GFNIAVX512BW-NEXT:    andb $85, %cl
+; GFNIAVX512BW-NEXT:    addb %cl, %cl
+; GFNIAVX512BW-NEXT:    shrb %al
+; GFNIAVX512BW-NEXT:    andb $85, %al
+; GFNIAVX512BW-NEXT:    orb %cl, %al
 ; GFNIAVX512BW-NEXT:    retq
   %b = call i8 @llvm.bitreverse.i8(i8 %a)
   ret i8 %b
diff --git a/llvm/test/CodeGen/X86/vector-ext-logic.ll b/llvm/test/CodeGen/X86/vector-ext-logic.ll
index 611c0d4..a26d7e9 100644
--- a/llvm/test/CodeGen/X86/vector-ext-logic.ll
+++ b/llvm/test/CodeGen/X86/vector-ext-logic.ll
@@ -5,13 +5,11 @@
 define <8 x i32> @zext_and_v8i32(<8 x i16> %x, <8 x i16> %y) {
 ; SSE2-LABEL: zext_and_v8i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE2-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_and_v8i32:
@@ -28,13 +26,11 @@
 define <8 x i32> @zext_or_v8i32(<8 x i16> %x, <8 x i16> %y) {
 ; SSE2-LABEL: zext_or_v8i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    por %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE2-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_or_v8i32:
@@ -51,13 +47,11 @@
 define <8 x i32> @zext_xor_v8i32(<8 x i16> %x, <8 x i16> %y) {
 ; SSE2-LABEL: zext_xor_v8i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE2-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_xor_v8i32:
@@ -298,14 +292,12 @@
 define <8 x i32> @bool_zext_or(<8 x i1> %x, <8 x i1> %y) {
 ; SSE2-LABEL: bool_zext_or:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    por %xmm1, %xmm2
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE2-NEXT:    retq
 ;
 ; AVX2-LABEL: bool_zext_or:
@@ -323,14 +315,12 @@
 define <8 x i32> @bool_zext_xor(<8 x i1> %x, <8 x i1> %y) {
 ; SSE2-LABEL: bool_zext_xor:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE2-NEXT:    retq
 ;
 ; AVX2-LABEL: bool_zext_xor:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index 0b4edf1..97a485b04 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -727,75 +727,74 @@
 ; SSE2-LABEL: var_funnnel_v16i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    psrlw $1, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE2-NEXT:    pand %xmm5, %xmm1
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; SSE2-NEXT:    movdqa %xmm2, %xmm7
 ; SSE2-NEXT:    pandn %xmm8, %xmm7
 ; SSE2-NEXT:    psllw $5, %xmm7
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtb %xmm7, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pandn %xmm1, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtb %xmm7, %xmm6
+; SSE2-NEXT:    movdqa %xmm6, %xmm4
+; SSE2-NEXT:    pandn %xmm1, %xmm4
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm6
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; SSE2-NEXT:    por %xmm4, %xmm6
 ; SSE2-NEXT:    paddb %xmm7, %xmm7
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtb %xmm7, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pandn %xmm1, %xmm5
-; SSE2-NEXT:    psrlw $2, %xmm1
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm7, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pandn %xmm6, %xmm4
+; SSE2-NEXT:    psrlw $2, %xmm6
+; SSE2-NEXT:    pand %xmm1, %xmm6
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; SSE2-NEXT:    por %xmm4, %xmm6
 ; SSE2-NEXT:    paddb %xmm7, %xmm7
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtb %xmm7, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pandn %xmm1, %xmm5
-; SSE2-NEXT:    psrlw $1, %xmm1
-; SSE2-NEXT:    pand %xmm6, %xmm4
-; SSE2-NEXT:    pand %xmm1, %xmm4
-; SSE2-NEXT:    por %xmm5, %xmm4
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm7, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pandn %xmm6, %xmm4
+; SSE2-NEXT:    psrlw $1, %xmm6
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm1
 ; SSE2-NEXT:    pand %xmm8, %xmm2
 ; SSE2-NEXT:    psllw $5, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtb %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
 ; SSE2-NEXT:    pandn %xmm0, %xmm5
 ; SSE2-NEXT:    psllw $4, %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm4, %xmm0
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    por %xmm5, %xmm0
 ; SSE2-NEXT:    paddb %xmm2, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtb %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
 ; SSE2-NEXT:    pandn %xmm0, %xmm5
 ; SSE2-NEXT:    psllw $2, %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm4, %xmm0
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    por %xmm5, %xmm0
 ; SSE2-NEXT:    paddb %xmm2, %xmm2
 ; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NEXT:    pandn %xmm0, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
 ; SSE2-NEXT:    paddb %xmm0, %xmm0
 ; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: var_funnnel_v16i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pandn %xmm4, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pandn %xmm3, %xmm0
 ; SSE41-NEXT:    psllw $5, %xmm0
 ; SSE41-NEXT:    psrlw $1, %xmm1
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
@@ -814,27 +813,27 @@
 ; SSE41-NEXT:    pand %xmm5, %xmm6
 ; SSE41-NEXT:    paddb %xmm0, %xmm0
 ; SSE41-NEXT:    pblendvb %xmm0, %xmm6, %xmm1
-; SSE41-NEXT:    pand %xmm4, %xmm3
+; SSE41-NEXT:    pand %xmm2, %xmm3
 ; SSE41-NEXT:    psllw $5, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    paddb %xmm3, %xmm4
-; SSE41-NEXT:    movdqa %xmm2, %xmm5
+; SSE41-NEXT:    movdqa %xmm3, %xmm2
+; SSE41-NEXT:    paddb %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm4, %xmm5
 ; SSE41-NEXT:    psllw $4, %xmm5
 ; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm3
 ; SSE41-NEXT:    psllw $2, %xmm3
 ; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    paddb %xmm2, %xmm3
-; SSE41-NEXT:    paddb %xmm4, %xmm4
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT:    por %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm3
+; SSE41-NEXT:    paddb %xmm4, %xmm3
+; SSE41-NEXT:    paddb %xmm2, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm4
+; SSE41-NEXT:    por %xmm1, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: var_funnnel_v16i8:
@@ -998,65 +997,65 @@
 ; X86-SSE2-LABEL: var_funnnel_v16i8:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    psrlw $1, %xmm1
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; X86-SSE2-NEXT:    pand %xmm5, %xmm1
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-SSE2-NEXT:    pand %xmm4, %xmm1
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm6
-; X86-SSE2-NEXT:    pandn %xmm4, %xmm6
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm6
 ; X86-SSE2-NEXT:    psllw $5, %xmm6
-; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
-; X86-SSE2-NEXT:    pcmpgtb %xmm6, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT:    pxor %xmm5, %xmm5
+; X86-SSE2-NEXT:    pcmpgtb %xmm6, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm5, %xmm7
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm7
 ; X86-SSE2-NEXT:    psrlw $4, %xmm1
-; X86-SSE2-NEXT:    pand %xmm3, %xmm1
-; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    por %xmm7, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm5
+; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5
+; X86-SSE2-NEXT:    por %xmm7, %xmm5
 ; X86-SSE2-NEXT:    paddb %xmm6, %xmm6
-; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
-; X86-SSE2-NEXT:    pcmpgtb %xmm6, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm7
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm7
-; X86-SSE2-NEXT:    psrlw $2, %xmm1
-; X86-SSE2-NEXT:    pand %xmm3, %xmm1
-; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    por %xmm7, %xmm1
-; X86-SSE2-NEXT:    paddb %xmm6, %xmm6
-; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
-; X86-SSE2-NEXT:    pcmpgtb %xmm6, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm6
-; X86-SSE2-NEXT:    pand %xmm5, %xmm3
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm6
-; X86-SSE2-NEXT:    psrlw $1, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm3
-; X86-SSE2-NEXT:    por %xmm6, %xmm3
-; X86-SSE2-NEXT:    pand %xmm4, %xmm2
-; X86-SSE2-NEXT:    psllw $5, %xmm2
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT:    pcmpgtb %xmm6, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm7
+; X86-SSE2-NEXT:    pandn %xmm5, %xmm7
+; X86-SSE2-NEXT:    psrlw $2, %xmm5
+; X86-SSE2-NEXT:    pand %xmm1, %xmm5
+; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5
+; X86-SSE2-NEXT:    por %xmm7, %xmm5
+; X86-SSE2-NEXT:    paddb %xmm6, %xmm6
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm6, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT:    pand %xmm4, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm5, %xmm6
+; X86-SSE2-NEXT:    psrlw $1, %xmm5
+; X86-SSE2-NEXT:    pand %xmm5, %xmm1
+; X86-SSE2-NEXT:    por %xmm6, %xmm1
+; X86-SSE2-NEXT:    pand %xmm3, %xmm2
+; X86-SSE2-NEXT:    psllw $5, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm4
 ; X86-SSE2-NEXT:    psllw $4, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
 ; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE2-NEXT:    por %xmm4, %xmm0
 ; X86-SSE2-NEXT:    paddb %xmm2, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
-; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm4
 ; X86-SSE2-NEXT:    psllw $2, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
 ; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE2-NEXT:    por %xmm4, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
 ; X86-SSE2-NEXT:    paddb %xmm2, %xmm2
-; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
-; X86-SSE2-NEXT:    por %xmm3, %xmm2
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
 ; X86-SSE2-NEXT:    paddb %xmm0, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
 ; X86-SSE2-NEXT:    por %xmm2, %xmm0
 ; X86-SSE2-NEXT:    retl
   %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index d7408e9..baab6cb 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -426,56 +426,56 @@
 define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
 ; SSE2-LABEL: var_funnnel_v8i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    psllw $12, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psraw $15, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    psllw $12, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    psraw $15, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
 ; SSE2-NEXT:    pandn %xmm1, %xmm5
 ; SSE2-NEXT:    psrlw $8, %xmm1
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    por %xmm5, %xmm1
-; SSE2-NEXT:    paddw %xmm3, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psraw $15, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pandn %xmm1, %xmm5
-; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    por %xmm5, %xmm1
-; SSE2-NEXT:    paddw %xmm3, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psraw $15, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    pandn %xmm1, %xmm5
-; SSE2-NEXT:    psrlw $2, %xmm1
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    por %xmm5, %xmm1
-; SSE2-NEXT:    paddw %xmm3, %xmm3
-; SSE2-NEXT:    psraw $15, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    pandn %xmm1, %xmm4
-; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm5, %xmm3
+; SSE2-NEXT:    paddw %xmm4, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    psraw $15, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    pandn %xmm3, %xmm5
+; SSE2-NEXT:    psrlw $4, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm5, %xmm3
+; SSE2-NEXT:    paddw %xmm4, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    psraw $15, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    pandn %xmm3, %xmm5
+; SSE2-NEXT:    psrlw $2, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm5, %xmm3
+; SSE2-NEXT:    paddw %xmm4, %xmm4
+; SSE2-NEXT:    psraw $15, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm3
+; SSE2-NEXT:    pand %xmm4, %xmm3
 ; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pslld $23, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pslld $23, %xmm4
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT:    paddd %xmm5, %xmm1
-; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT:    pslld $16, %xmm1
-; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    paddd %xmm5, %xmm4
+; SSE2-NEXT:    cvttps2dq %xmm4, %xmm4
+; SSE2-NEXT:    pslld $16, %xmm4
+; SSE2-NEXT:    psrad $16, %xmm4
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
 ; SSE2-NEXT:    pslld $23, %xmm2
 ; SSE2-NEXT:    paddd %xmm5, %xmm2
 ; SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
 ; SSE2-NEXT:    pslld $16, %xmm2
 ; SSE2-NEXT:    psrad $16, %xmm2
-; SSE2-NEXT:    packssdw %xmm1, %xmm2
+; SSE2-NEXT:    packssdw %xmm4, %xmm2
 ; SSE2-NEXT:    psllw $1, %xmm0
 ; SSE2-NEXT:    pmullw %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -669,56 +669,56 @@
 ;
 ; X86-SSE2-LABEL: var_funnnel_v8i16:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT:    psllw $12, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
-; X86-SSE2-NEXT:    psraw $15, %xmm4
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT:    psllw $12, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm3
+; X86-SSE2-NEXT:    psraw $15, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm5
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm5
 ; X86-SSE2-NEXT:    psrlw $8, %xmm1
-; X86-SSE2-NEXT:    pand %xmm4, %xmm1
-; X86-SSE2-NEXT:    por %xmm5, %xmm1
-; X86-SSE2-NEXT:    paddw %xmm3, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
-; X86-SSE2-NEXT:    psraw $15, %xmm4
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm5
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm5
-; X86-SSE2-NEXT:    psrlw $4, %xmm1
-; X86-SSE2-NEXT:    pand %xmm4, %xmm1
-; X86-SSE2-NEXT:    por %xmm5, %xmm1
-; X86-SSE2-NEXT:    paddw %xmm3, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
-; X86-SSE2-NEXT:    psraw $15, %xmm4
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm5
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm5
-; X86-SSE2-NEXT:    psrlw $2, %xmm1
-; X86-SSE2-NEXT:    pand %xmm4, %xmm1
-; X86-SSE2-NEXT:    por %xmm5, %xmm1
-; X86-SSE2-NEXT:    paddw %xmm3, %xmm3
-; X86-SSE2-NEXT:    psraw $15, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm4
-; X86-SSE2-NEXT:    psrlw $1, %xmm1
 ; X86-SSE2-NEXT:    pand %xmm1, %xmm3
+; X86-SSE2-NEXT:    por %xmm5, %xmm3
+; X86-SSE2-NEXT:    paddw %xmm4, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT:    psraw $15, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm5
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm5
+; X86-SSE2-NEXT:    psrlw $4, %xmm3
+; X86-SSE2-NEXT:    pand %xmm1, %xmm3
+; X86-SSE2-NEXT:    por %xmm5, %xmm3
+; X86-SSE2-NEXT:    paddw %xmm4, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT:    psraw $15, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm5
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm5
+; X86-SSE2-NEXT:    psrlw $2, %xmm3
+; X86-SSE2-NEXT:    pand %xmm1, %xmm3
+; X86-SSE2-NEXT:    por %xmm5, %xmm3
+; X86-SSE2-NEXT:    paddw %xmm4, %xmm4
+; X86-SSE2-NEXT:    psraw $15, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm1
+; X86-SSE2-NEXT:    psrlw $1, %xmm3
+; X86-SSE2-NEXT:    pand %xmm4, %xmm3
 ; X86-SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT:    pslld $23, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT:    pslld $23, %xmm4
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
-; X86-SSE2-NEXT:    paddd %xmm5, %xmm1
-; X86-SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; X86-SSE2-NEXT:    pslld $16, %xmm1
-; X86-SSE2-NEXT:    psrad $16, %xmm1
+; X86-SSE2-NEXT:    paddd %xmm5, %xmm4
+; X86-SSE2-NEXT:    cvttps2dq %xmm4, %xmm4
+; X86-SSE2-NEXT:    pslld $16, %xmm4
+; X86-SSE2-NEXT:    psrad $16, %xmm4
 ; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
 ; X86-SSE2-NEXT:    pslld $23, %xmm2
 ; X86-SSE2-NEXT:    paddd %xmm5, %xmm2
 ; X86-SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
 ; X86-SSE2-NEXT:    pslld $16, %xmm2
 ; X86-SSE2-NEXT:    psrad $16, %xmm2
-; X86-SSE2-NEXT:    packssdw %xmm1, %xmm2
+; X86-SSE2-NEXT:    packssdw %xmm4, %xmm2
 ; X86-SSE2-NEXT:    psllw $1, %xmm0
 ; X86-SSE2-NEXT:    pmullw %xmm2, %xmm0
-; X86-SSE2-NEXT:    por %xmm4, %xmm0
+; X86-SSE2-NEXT:    por %xmm1, %xmm0
 ; X86-SSE2-NEXT:    por %xmm3, %xmm0
 ; X86-SSE2-NEXT:    retl
   %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
@@ -738,24 +738,24 @@
 ; SSE2-NEXT:    movdqa %xmm4, %xmm7
 ; SSE2-NEXT:    pandn %xmm1, %xmm7
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    por %xmm7, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE2-NEXT:    por %xmm7, %xmm4
 ; SSE2-NEXT:    paddb %xmm6, %xmm6
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtb %xmm6, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm7
-; SSE2-NEXT:    pandn %xmm1, %xmm7
-; SSE2-NEXT:    psrlw $2, %xmm1
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    por %xmm7, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm6, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm7
+; SSE2-NEXT:    pandn %xmm4, %xmm7
+; SSE2-NEXT:    psrlw $2, %xmm4
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE2-NEXT:    por %xmm7, %xmm4
 ; SSE2-NEXT:    paddb %xmm6, %xmm6
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtb %xmm6, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm6
-; SSE2-NEXT:    pandn %xmm1, %xmm6
-; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm6, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm6
+; SSE2-NEXT:    pandn %xmm4, %xmm6
+; SSE2-NEXT:    psrlw $1, %xmm4
 ; SSE2-NEXT:    pand %xmm1, %xmm4
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
 ; SSE2-NEXT:    por %xmm6, %xmm4
@@ -999,24 +999,24 @@
 ; X86-SSE2-NEXT:    movdqa %xmm4, %xmm7
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm7
 ; X86-SSE2-NEXT:    psrlw $4, %xmm1
-; X86-SSE2-NEXT:    pand %xmm4, %xmm1
-; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    por %xmm7, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm4
+; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
+; X86-SSE2-NEXT:    por %xmm7, %xmm4
 ; X86-SSE2-NEXT:    paddb %xmm6, %xmm6
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm4
-; X86-SSE2-NEXT:    pcmpgtb %xmm6, %xmm4
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm7
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm7
-; X86-SSE2-NEXT:    psrlw $2, %xmm1
-; X86-SSE2-NEXT:    pand %xmm4, %xmm1
-; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    por %xmm7, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm6, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm7
+; X86-SSE2-NEXT:    pandn %xmm4, %xmm7
+; X86-SSE2-NEXT:    psrlw $2, %xmm4
+; X86-SSE2-NEXT:    pand %xmm1, %xmm4
+; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
+; X86-SSE2-NEXT:    por %xmm7, %xmm4
 ; X86-SSE2-NEXT:    paddb %xmm6, %xmm6
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm4
-; X86-SSE2-NEXT:    pcmpgtb %xmm6, %xmm4
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm6
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm6
-; X86-SSE2-NEXT:    psrlw $1, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm6, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT:    pandn %xmm4, %xmm6
+; X86-SSE2-NEXT:    psrlw $1, %xmm4
 ; X86-SSE2-NEXT:    pand %xmm1, %xmm4
 ; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
 ; X86-SSE2-NEXT:    por %xmm6, %xmm4
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index 9841a42..53e4508 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -78,41 +78,39 @@
 define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
 ; SSE2-LABEL: test_div7_4i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pmuludq %xmm2, %xmm3
+; SSE2-NEXT:    pmuludq %xmm1, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm3
 ; SSE2-NEXT:    paddd %xmm0, %xmm3
-; SSE2-NEXT:    psubd %xmm3, %xmm1
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $31, %xmm0
-; SSE2-NEXT:    psrad $2, %xmm1
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psubd %xmm3, %xmm2
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $31, %xmm1
+; SSE2-NEXT:    psrad $2, %xmm0
+; SSE2-NEXT:    paddd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: test_div7_4i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; SSE41-NEXT:    pmuldq %xmm1, %xmm2
-; SSE41-NEXT:    pmuldq %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT:    paddd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    psrld $31, %xmm0
-; SSE41-NEXT:    psrad $2, %xmm1
-; SSE41-NEXT:    paddd %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; SSE41-NEXT:    pmuldq %xmm2, %xmm1
+; SSE41-NEXT:    pmuldq %xmm0, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; SSE41-NEXT:    paddd %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $31, %xmm1
+; SSE41-NEXT:    psrad $2, %xmm0
+; SSE41-NEXT:    paddd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: test_div7_4i32:
@@ -170,17 +168,16 @@
 define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
 ; SSE-LABEL: test_div7_16i8:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    pxor %xmm0, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm1
 ; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
 ; SSE-NEXT:    pmulhw %xmm3, %xmm2
 ; SSE-NEXT:    psrlw $8, %xmm2
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    pmulhw %xmm3, %xmm0
-; SSE-NEXT:    psrlw $8, %xmm0
-; SSE-NEXT:    packuswb %xmm2, %xmm0
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT:    pmulhw %xmm3, %xmm1
+; SSE-NEXT:    psrlw $8, %xmm1
+; SSE-NEXT:    packuswb %xmm2, %xmm1
 ; SSE-NEXT:    paddb %xmm1, %xmm0
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    psrlw $2, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
index 7ae52d9..0cb2b01 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
@@ -18,80 +18,80 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlq $1, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlq $2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlq $2, %xmm1
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlq $4, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlq $8, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlq $8, %xmm1
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlq $16, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlq $32, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlq $32, %xmm1
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    psubb %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrlw $2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    paddb %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrlw $4, %xmm2
-; SSE2-NEXT:    paddb %xmm1, %xmm2
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: testv2i64:
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlq $1, %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlq $2, %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlq $2, %xmm1
 ; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlq $4, %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlq $8, %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlq $8, %xmm1
 ; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlq $16, %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlq $32, %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlq $32, %xmm1
 ; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlw $1, %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    pand %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE3-NEXT:    psubb %xmm0, %xmm1
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrlw $2, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    paddb %xmm2, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    psrlw $4, %xmm2
-; SSE3-NEXT:    paddb %xmm1, %xmm2
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm2, %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: testv2i64:
@@ -268,80 +268,80 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlq $1, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlq $2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlq $2, %xmm1
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlq $4, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlq $8, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlq $8, %xmm1
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlq $16, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlq $32, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlq $32, %xmm1
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    psubb %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrlw $2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    paddb %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrlw $4, %xmm2
-; SSE2-NEXT:    paddb %xmm1, %xmm2
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: testv2i64u:
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlq $1, %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlq $2, %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlq $2, %xmm1
 ; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlq $4, %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlq $8, %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlq $8, %xmm1
 ; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlq $16, %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlq $32, %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlq $32, %xmm1
 ; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlw $1, %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    pand %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE3-NEXT:    psubb %xmm0, %xmm1
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrlw $2, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    paddb %xmm2, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    psrlw $4, %xmm2
-; SSE3-NEXT:    paddb %xmm1, %xmm2
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    psadbw %xmm2, %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: testv2i64u:
@@ -518,34 +518,34 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrld $1, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $2, %xmm1
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrld $4, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $8, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $8, %xmm1
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrlw $1, %xmm0
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    psubb %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    psrlw $2, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    paddb %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
@@ -560,34 +560,34 @@
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrld $1, %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrld $2, %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrld $2, %xmm1
 ; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrld $4, %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrld $8, %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrld $8, %xmm1
 ; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrld $16, %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE3-NEXT:    pxor %xmm1, %xmm2
-; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrlw $1, %xmm0
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE3-NEXT:    psubb %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT:    movdqa %xmm2, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    psrlw $2, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    paddb %xmm1, %xmm2
-; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrlw $4, %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    pxor %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    pand %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    pxor %xmm1, %xmm1
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
@@ -744,34 +744,34 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrld $1, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $2, %xmm1
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrld $4, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $8, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $8, %xmm1
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrlw $1, %xmm0
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    psubb %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    psrlw $2, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    paddb %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
@@ -786,34 +786,34 @@
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrld $1, %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrld $2, %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrld $2, %xmm1
 ; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrld $4, %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrld $8, %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrld $8, %xmm1
 ; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrld $16, %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE3-NEXT:    pxor %xmm1, %xmm2
-; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrlw $1, %xmm0
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE3-NEXT:    psubb %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT:    movdqa %xmm2, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    psrlw $2, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    paddb %xmm1, %xmm2
-; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrlw $4, %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    pxor %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    pand %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    pxor %xmm1, %xmm1
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
@@ -970,35 +970,35 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $2, %xmm1
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlw $1, %xmm0
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    psubb %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrlw $2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    paddb %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrlw $4, %xmm2
-; SSE2-NEXT:    paddb %xmm1, %xmm2
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
+; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -1006,35 +1006,35 @@
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $1, %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $2, %xmm1
 ; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlw $8, %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $8, %xmm1
 ; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlw $1, %xmm0
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE3-NEXT:    psubb %xmm0, %xmm1
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrlw $2, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    paddb %xmm2, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    psrlw $4, %xmm2
-; SSE3-NEXT:    paddb %xmm1, %xmm2
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psllw $8, %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    pand %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psllw $8, %xmm1
+; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $8, %xmm0
 ; SSE3-NEXT:    retq
 ;
@@ -1160,35 +1160,35 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $2, %xmm1
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm1
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlw $1, %xmm0
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    psubb %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrlw $2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    paddb %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrlw $4, %xmm2
-; SSE2-NEXT:    paddb %xmm1, %xmm2
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
+; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -1196,35 +1196,35 @@
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $1, %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $2, %xmm1
 ; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlw $8, %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $8, %xmm1
 ; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlw $1, %xmm0
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE3-NEXT:    psubb %xmm0, %xmm1
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrlw $2, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    paddb %xmm2, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    psrlw $4, %xmm2
-; SSE3-NEXT:    paddb %xmm1, %xmm2
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psllw $8, %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    pand %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psllw $8, %xmm1
+; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $8, %xmm0
 ; SSE3-NEXT:    retq
 ;
@@ -1351,31 +1351,31 @@
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlw $2, %xmm0
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $2, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlw $1, %xmm0
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    psubb %xmm0, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    psrlw $2, %xmm3
-; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    paddb %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    paddb %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -1384,31 +1384,31 @@
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $1, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlw $2, %xmm0
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $2, %xmm1
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
 ; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE3-NEXT:    pxor %xmm1, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlw $1, %xmm0
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE3-NEXT:    psubb %xmm0, %xmm3
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT:    movdqa %xmm3, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    psrlw $2, %xmm3
-; SSE3-NEXT:    pand %xmm0, %xmm3
-; SSE3-NEXT:    paddb %xmm1, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlw $4, %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    pxor %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSE3-NEXT:    pand %xmm1, %xmm3
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
 ; SSE3-NEXT:    paddb %xmm3, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    pand %xmm2, %xmm0
 ; SSE3-NEXT:    retq
 ;
@@ -1501,31 +1501,31 @@
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlw $2, %xmm0
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $2, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlw $1, %xmm0
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    psubb %xmm0, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    psrlw $2, %xmm3
-; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    paddb %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    paddb %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -1534,31 +1534,31 @@
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $1, %xmm1
 ; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlw $2, %xmm0
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $2, %xmm1
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE3-NEXT:    por %xmm1, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
 ; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE3-NEXT:    pand %xmm2, %xmm1
-; SSE3-NEXT:    por %xmm0, %xmm1
-; SSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE3-NEXT:    pxor %xmm1, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlw $1, %xmm0
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE3-NEXT:    psubb %xmm0, %xmm3
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT:    movdqa %xmm3, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    psrlw $2, %xmm3
-; SSE3-NEXT:    pand %xmm0, %xmm3
-; SSE3-NEXT:    paddb %xmm1, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlw $4, %xmm0
+; SSE3-NEXT:    por %xmm1, %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    pxor %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSE3-NEXT:    pand %xmm1, %xmm3
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
 ; SSE3-NEXT:    paddb %xmm3, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    pand %xmm2, %xmm0
 ; SSE3-NEXT:    retq
 ;
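An aside on reading the regenerated checks: apart from which xmm register carries the running value (and the trailing movdqa copies that disappear as a result), the instruction sequences are untouched, and they correspond to the classic branch-free ctlz lowering — smear the leading one bit across all lower positions with a shift/or cascade, invert, and take a bit-parallel (SWAR) population count. The [51,...] and [15,...] splats in the checks are the 0x33 and 0x0F byte masks of that popcount; the per-bit mask after the psrlw $1 step is folded into an LCPI constant-pool load (0x55... is its standard value), and the v2i64/v4i32 variants fold byte sums with psadbw. A scalar C++ sketch of the same recipe, for orientation only — the function names and scalar types are illustrative, not from the patch:

    #include <cstdint>

    // Scalar model of the SWAR popcount in the checks above: the
    // psrlw/pand/psubb step, the paired 0x33 (51) masks, and the final
    // 0x0F (15) mask. The vector code folds byte sums with psadbw
    // (or a psllw $8/paddb/psrlw $8 pair for i16) instead of the
    // multiply used here.
    uint32_t popcount32(uint32_t v) {
      v = v - ((v >> 1) & 0x55555555u);                 // 2-bit sums
      v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u); // 4-bit sums
      v = (v + (v >> 4)) & 0x0F0F0F0Fu;                 // byte sums
      return (v * 0x01010101u) >> 24;                   // fold bytes
    }

    // The ctlz tests first smear the leading one rightward (the
    // psrld/por or psrlq/por cascade), then count the zero bits that
    // remain above it.
    uint32_t ctlz32(uint32_t v) {
      v |= v >> 1;  v |= v >> 2;  v |= v >> 4;
      v |= v >> 8;  v |= v >> 16;  // all bits below the MSB now set
      return popcount32(~v);       // pcmpeqd/pxor does the NOT above
    }

With that shape in mind, each hunk below is the same rewrite: the copy that used to materialize the result back into %xmm0 at the end is avoided because the accumulator now lives in %xmm0 from the start.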
diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-sub128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-sub128.ll
index e903c09..a1b277e 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-sub128.ll
@@ -8,34 +8,34 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-NEXT:    psrld $1, %xmm1
-; CHECK-NEXT:    por %xmm0, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-NEXT:    psrld $2, %xmm0
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrld $2, %xmm1
 ; CHECK-NEXT:    por %xmm1, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-NEXT:    psrld $4, %xmm1
-; CHECK-NEXT:    por %xmm0, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-NEXT:    psrld $8, %xmm0
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrld $8, %xmm1
 ; CHECK-NEXT:    por %xmm1, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-NEXT:    psrld $16, %xmm1
-; CHECK-NEXT:    por %xmm0, %xmm1
-; CHECK-NEXT:    pcmpeqd %xmm2, %xmm2
-; CHECK-NEXT:    pxor %xmm1, %xmm2
-; CHECK-NEXT:    movdqa %xmm2, %xmm0
-; CHECK-NEXT:    psrlw $1, %xmm0
-; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    psubb %xmm0, %xmm2
-; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; CHECK-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-NEXT:    pand %xmm0, %xmm1
-; CHECK-NEXT:    psrlw $2, %xmm2
-; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    paddb %xmm1, %xmm2
-; CHECK-NEXT:    movdqa %xmm2, %xmm0
-; CHECK-NEXT:    psrlw $4, %xmm0
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT:    pxor %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $1, %xmm1
+; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-NEXT:    psubb %xmm1, %xmm0
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; CHECK-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NEXT:    pand %xmm1, %xmm2
+; CHECK-NEXT:    psrlw $2, %xmm0
+; CHECK-NEXT:    pand %xmm1, %xmm0
 ; CHECK-NEXT:    paddb %xmm2, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $4, %xmm1
+; CHECK-NEXT:    paddb %xmm1, %xmm0
 ; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    pxor %xmm1, %xmm1
 ; CHECK-NEXT:    movdqa %xmm0, %xmm2
@@ -69,16 +69,15 @@
 ; CHECK-NEXT:    paddb %xmm2, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-NEXT:    psrlw $4, %xmm1
-; CHECK-NEXT:    paddb %xmm0, %xmm1
-; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT:    pxor %xmm0, %xmm0
-; CHECK-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; CHECK-NEXT:    psadbw %xmm0, %xmm2
-; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT:    psadbw %xmm0, %xmm1
-; CHECK-NEXT:    packuswb %xmm2, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    paddb %xmm1, %xmm0
+; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-NEXT:    psadbw %xmm1, %xmm2
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    psadbw %xmm1, %xmm0
+; CHECK-NEXT:    packuswb %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %v2 = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %v1, i1 true)
   ret <2 x i32> %v2
diff --git a/llvm/test/CodeGen/X86/vector-narrow-binop.ll b/llvm/test/CodeGen/X86/vector-narrow-binop.ll
index 1d4b457..01190f5 100644
--- a/llvm/test/CodeGen/X86/vector-narrow-binop.ll
+++ b/llvm/test/CodeGen/X86/vector-narrow-binop.ll
@@ -153,9 +153,9 @@
 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
 ; SSE-NEXT:    mulpd %xmm2, %xmm2
 ; SSE-NEXT:    mulpd %xmm1, %xmm1
-; SSE-NEXT:    addpd %xmm2, %xmm1
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    addpd %xmm1, %xmm2
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT:    movapd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: fmul_v2f64:
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
index 4a9439c..f95178f 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
@@ -1669,10 +1669,10 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1692,10 +1692,10 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psllw $8, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psllw $8, %xmm1
 ; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $8, %xmm0
 ; SSE3-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1988,10 +1988,10 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2011,10 +2011,10 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psllw $8, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psllw $8, %xmm1
 ; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $8, %xmm0
 ; SSE3-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2307,10 +2307,10 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2330,10 +2330,10 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psllw $8, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psllw $8, %xmm1
 ; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $8, %xmm0
 ; SSE3-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2626,10 +2626,10 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2649,10 +2649,10 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psllw $8, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psllw $8, %xmm1
 ; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $8, %xmm0
 ; SSE3-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2945,10 +2945,10 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2968,10 +2968,10 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psllw $8, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psllw $8, %xmm1
 ; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $8, %xmm0
 ; SSE3-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -3264,10 +3264,10 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -3287,10 +3287,10 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psllw $8, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psllw $8, %xmm1
 ; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $8, %xmm0
 ; SSE3-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -3583,10 +3583,10 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -3606,10 +3606,10 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psllw $8, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psllw $8, %xmm1
 ; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $8, %xmm0
 ; SSE3-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -3902,10 +3902,10 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -3925,10 +3925,10 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psllw $8, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psllw $8, %xmm1
 ; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $8, %xmm0
 ; SSE3-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -4221,10 +4221,10 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -4244,10 +4244,10 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psllw $8, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psllw $8, %xmm1
 ; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $8, %xmm0
 ; SSE3-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -4540,10 +4540,10 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -4563,10 +4563,10 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psllw $8, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psllw $8, %xmm1
 ; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $8, %xmm0
 ; SSE3-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -4859,10 +4859,10 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -4882,10 +4882,10 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psllw $8, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psllw $8, %xmm1
 ; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $8, %xmm0
 ; SSE3-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -5178,10 +5178,10 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -5201,10 +5201,10 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psllw $8, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psllw $8, %xmm1
 ; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $8, %xmm0
 ; SSE3-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -5497,10 +5497,10 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -5520,10 +5520,10 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psllw $8, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psllw $8, %xmm1
 ; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $8, %xmm0
 ; SSE3-NEXT:    pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -5960,17 +5960,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_2_v4i32:
@@ -5987,17 +5986,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_2_v4i32:
@@ -6340,17 +6338,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_3_v4i32:
@@ -6367,17 +6364,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_3_v4i32:
@@ -6720,17 +6716,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_4_v4i32:
@@ -6747,17 +6742,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_4_v4i32:
@@ -7100,17 +7094,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_5_v4i32:
@@ -7127,17 +7120,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_5_v4i32:
@@ -7480,17 +7472,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_6_v4i32:
@@ -7507,17 +7498,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_6_v4i32:
@@ -7860,17 +7850,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_7_v4i32:
@@ -7887,17 +7876,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_7_v4i32:
@@ -8240,17 +8228,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_8_v4i32:
@@ -8267,17 +8254,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_8_v4i32:
@@ -8620,17 +8606,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_9_v4i32:
@@ -8647,17 +8632,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_9_v4i32:
@@ -9000,17 +8984,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_10_v4i32:
@@ -9027,17 +9010,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_10_v4i32:
@@ -9380,17 +9362,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_11_v4i32:
@@ -9407,17 +9388,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_11_v4i32:
@@ -9760,17 +9740,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_12_v4i32:
@@ -9787,17 +9766,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_12_v4i32:
@@ -10140,17 +10118,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_13_v4i32:
@@ -10167,17 +10144,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_13_v4i32:
@@ -10520,17 +10496,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_14_v4i32:
@@ -10547,17 +10522,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_14_v4i32:
@@ -10900,17 +10874,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_15_v4i32:
@@ -10927,17 +10900,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_15_v4i32:
@@ -11280,17 +11252,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_16_v4i32:
@@ -11307,17 +11278,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_16_v4i32:
@@ -11660,17 +11630,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_17_v4i32:
@@ -11687,17 +11656,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_17_v4i32:
@@ -12040,17 +12008,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_18_v4i32:
@@ -12067,17 +12034,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_18_v4i32:
@@ -12420,17 +12386,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_19_v4i32:
@@ -12447,17 +12412,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_19_v4i32:
@@ -12800,17 +12764,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_20_v4i32:
@@ -12827,17 +12790,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_20_v4i32:
@@ -13180,17 +13142,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_21_v4i32:
@@ -13207,17 +13168,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_21_v4i32:
@@ -13560,17 +13520,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_22_v4i32:
@@ -13587,17 +13546,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_22_v4i32:
@@ -13940,17 +13898,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_23_v4i32:
@@ -13967,17 +13924,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_23_v4i32:
@@ -14320,17 +14276,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_24_v4i32:
@@ -14347,17 +14302,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_24_v4i32:
@@ -14700,17 +14654,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_25_v4i32:
@@ -14727,17 +14680,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_25_v4i32:
@@ -15080,17 +15032,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_26_v4i32:
@@ -15107,17 +15058,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_26_v4i32:
@@ -15460,17 +15410,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_27_v4i32:
@@ -15487,17 +15436,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_27_v4i32:
@@ -15840,17 +15788,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_28_v4i32:
@@ -15867,17 +15814,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_28_v4i32:
@@ -16220,17 +16166,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_29_v4i32:
@@ -16247,17 +16192,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_29_v4i32:
@@ -16600,17 +16544,16 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ugt_30_v4i32:
@@ -16627,17 +16570,16 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ugt_30_v4i32:
@@ -16971,11 +16913,11 @@
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    paddq %xmm1, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2]
+; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -16984,11 +16926,11 @@
 ; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
 ; SSE3-NEXT:    paddq %xmm1, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    pxor %xmm3, %xmm3
-; SSE3-NEXT:    pcmpeqd %xmm2, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSE3-NEXT:    pand %xmm3, %xmm0
+; SSE3-NEXT:    pand %xmm2, %xmm0
+; SSE3-NEXT:    pxor %xmm2, %xmm2
+; SSE3-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2]
+; SSE3-NEXT:    pand %xmm2, %xmm0
 ; SSE3-NEXT:    pxor %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
@@ -16997,11 +16939,11 @@
 ; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
 ; SSSE3-NEXT:    paddq %xmm1, %xmm2
-; SSSE3-NEXT:    pand %xmm0, %xmm2
-; SSSE3-NEXT:    pxor %xmm3, %xmm3
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2]
+; SSSE3-NEXT:    pand %xmm2, %xmm0
 ; SSSE3-NEXT:    pxor %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
@@ -17084,33 +17026,33 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
 ; SSE2-NEXT:    paddq %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ult_2_v2i64:
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
 ; SSE3-NEXT:    paddq %xmm0, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    pxor %xmm2, %xmm2
-; SSE3-NEXT:    pcmpeqd %xmm1, %xmm2
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSE3-NEXT:    pand %xmm2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; SSE3-NEXT:    pand %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ult_2_v2i64:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
 ; SSSE3-NEXT:    paddq %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; SSSE3-NEXT:    pand %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ult_2_v2i64:
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
index 70aae67..ac7d4d6 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
@@ -158,16 +158,15 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: testv4i32:
@@ -184,16 +183,15 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: testv4i32:
@@ -329,10 +327,10 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    retq
@@ -351,10 +349,10 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psllw $8, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psllw $8, %xmm1
 ; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $8, %xmm0
 ; SSE3-NEXT:    retq
@@ -804,52 +802,55 @@
 ; SSE2-LABEL: ne_1_v2i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE2-NEXT:    pand %xmm2, %xmm3
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    paddq %xmm2, %xmm3
-; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    paddq %xmm2, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,0,3,2]
-; SSE2-NEXT:    pand %xmm0, %xmm4
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: ne_1_v2i64:
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE3-NEXT:    pand %xmm2, %xmm3
 ; SSE3-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSE3-NEXT:    paddq %xmm2, %xmm3
-; SSE3-NEXT:    pand %xmm0, %xmm3
+; SSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSE3-NEXT:    paddq %xmm2, %xmm4
+; SSE3-NEXT:    pand %xmm4, %xmm0
 ; SSE3-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,0,3,2]
-; SSE3-NEXT:    pand %xmm0, %xmm4
-; SSE3-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSE3-NEXT:    pand %xmm3, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; SSE3-NEXT:    pand %xmm1, %xmm0
 ; SSE3-NEXT:    pxor %xmm2, %xmm0
-; SSE3-NEXT:    por %xmm4, %xmm0
+; SSE3-NEXT:    por %xmm3, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: ne_1_v2i64:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSSE3-NEXT:    pand %xmm2, %xmm3
 ; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    paddq %xmm2, %xmm3
-; SSSE3-NEXT:    pand %xmm0, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    paddq %xmm2, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm0
 ; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm0, %xmm4
-; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; SSSE3-NEXT:    pand %xmm1, %xmm0
 ; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    por %xmm4, %xmm0
+; SSSE3-NEXT:    por %xmm3, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: ne_1_v2i64:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
index e8738ec..254d8ed 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
@@ -321,9 +321,9 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps %xmm0, %xmm1
 ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT:    addps %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
 ; SSE2-NEXT:    addss %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -331,8 +331,8 @@
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT:    addps %xmm0, %xmm1
-; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm1, %xmm0
+; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    addss %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -375,9 +375,9 @@
 ; SSE2-NEXT:    addps %xmm1, %xmm0
 ; SSE2-NEXT:    movaps %xmm0, %xmm1
 ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT:    addps %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
 ; SSE2-NEXT:    addss %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -386,8 +386,8 @@
 ; SSE41-NEXT:    addps %xmm1, %xmm0
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT:    addps %xmm0, %xmm1
-; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm1, %xmm0
+; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    addss %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -444,9 +444,9 @@
 ; SSE2-NEXT:    addps %xmm1, %xmm0
 ; SSE2-NEXT:    movaps %xmm0, %xmm1
 ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT:    addps %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
 ; SSE2-NEXT:    addss %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -457,8 +457,8 @@
 ; SSE41-NEXT:    addps %xmm1, %xmm0
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT:    addps %xmm0, %xmm1
-; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm1, %xmm0
+; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    addss %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -562,9 +562,9 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps %xmm0, %xmm1
 ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT:    addps %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
 ; SSE2-NEXT:    addss %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -572,8 +572,8 @@
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT:    addps %xmm0, %xmm1
-; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm1, %xmm0
+; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    addss %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -616,9 +616,9 @@
 ; SSE2-NEXT:    addps %xmm1, %xmm0
 ; SSE2-NEXT:    movaps %xmm0, %xmm1
 ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT:    addps %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
 ; SSE2-NEXT:    addss %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -627,8 +627,8 @@
 ; SSE41-NEXT:    addps %xmm1, %xmm0
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT:    addps %xmm0, %xmm1
-; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm1, %xmm0
+; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    addss %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -685,9 +685,9 @@
 ; SSE2-NEXT:    addps %xmm1, %xmm0
 ; SSE2-NEXT:    movaps %xmm0, %xmm1
 ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT:    addps %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
 ; SSE2-NEXT:    addss %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -698,8 +698,8 @@
 ; SSE41-NEXT:    addps %xmm1, %xmm0
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT:    addps %xmm0, %xmm1
-; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm1, %xmm0
+; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    addss %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1126,9 +1126,9 @@
 ; SSE-NEXT:    addpd %xmm7, %xmm3
 ; SSE-NEXT:    addpd %xmm5, %xmm1
 ; SSE-NEXT:    addpd %xmm3, %xmm1
-; SSE-NEXT:    addpd %xmm0, %xmm1
-; SSE-NEXT:    movapd %xmm1, %xmm0
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE-NEXT:    addsd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
@@ -1330,9 +1330,9 @@
 ; SSE-NEXT:    addpd %xmm7, %xmm3
 ; SSE-NEXT:    addpd %xmm5, %xmm1
 ; SSE-NEXT:    addpd %xmm3, %xmm1
-; SSE-NEXT:    addpd %xmm0, %xmm1
-; SSE-NEXT:    movapd %xmm1, %xmm0
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE-NEXT:    addsd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll
index e0b30d1..7b17b34 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll
@@ -233,9 +233,9 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps %xmm0, %xmm1
 ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    mulps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT:    mulps %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -243,8 +243,8 @@
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT:    mulps %xmm0, %xmm1
-; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    mulps %xmm1, %xmm0
+; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -273,9 +273,9 @@
 ; SSE2-NEXT:    mulps %xmm1, %xmm0
 ; SSE2-NEXT:    movaps %xmm0, %xmm1
 ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    mulps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT:    mulps %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -284,8 +284,8 @@
 ; SSE41-NEXT:    mulps %xmm1, %xmm0
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT:    mulps %xmm0, %xmm1
-; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    mulps %xmm1, %xmm0
+; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -322,9 +322,9 @@
 ; SSE2-NEXT:    mulps %xmm1, %xmm0
 ; SSE2-NEXT:    movaps %xmm0, %xmm1
 ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    mulps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT:    mulps %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -335,8 +335,8 @@
 ; SSE41-NEXT:    mulps %xmm1, %xmm0
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT:    mulps %xmm0, %xmm1
-; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    mulps %xmm1, %xmm0
+; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -406,9 +406,9 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps %xmm0, %xmm1
 ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    mulps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT:    mulps %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -416,8 +416,8 @@
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT:    mulps %xmm0, %xmm1
-; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    mulps %xmm1, %xmm0
+; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -446,9 +446,9 @@
 ; SSE2-NEXT:    mulps %xmm1, %xmm0
 ; SSE2-NEXT:    movaps %xmm0, %xmm1
 ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    mulps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT:    mulps %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -457,8 +457,8 @@
 ; SSE41-NEXT:    mulps %xmm1, %xmm0
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT:    mulps %xmm0, %xmm1
-; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    mulps %xmm1, %xmm0
+; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -495,9 +495,9 @@
 ; SSE2-NEXT:    mulps %xmm1, %xmm0
 ; SSE2-NEXT:    movaps %xmm0, %xmm1
 ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    mulps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT:    mulps %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
 ; SSE2-NEXT:    mulss %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -508,8 +508,8 @@
 ; SSE41-NEXT:    mulps %xmm1, %xmm0
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT:    mulps %xmm0, %xmm1
-; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    mulps %xmm1, %xmm0
+; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    mulss %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -788,9 +788,9 @@
 ; SSE-NEXT:    mulpd %xmm7, %xmm3
 ; SSE-NEXT:    mulpd %xmm5, %xmm1
 ; SSE-NEXT:    mulpd %xmm3, %xmm1
-; SSE-NEXT:    mulpd %xmm0, %xmm1
-; SSE-NEXT:    movapd %xmm1, %xmm0
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    mulpd %xmm1, %xmm0
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE-NEXT:    mulsd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
@@ -922,9 +922,9 @@
 ; SSE-NEXT:    mulpd %xmm7, %xmm3
 ; SSE-NEXT:    mulpd %xmm5, %xmm1
 ; SSE-NEXT:    mulpd %xmm3, %xmm1
-; SSE-NEXT:    mulpd %xmm0, %xmm1
-; SSE-NEXT:    movapd %xmm1, %xmm0
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    mulpd %xmm1, %xmm0
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE-NEXT:    mulsd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index 0244beb..3dcc2507ac6 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -257,33 +257,31 @@
 ;
 ; SSE41-LABEL: var_shift_v8i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    psllw $12, %xmm0
-; SSE41-NEXT:    psllw $4, %xmm2
-; SSE41-NEXT:    por %xmm0, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    paddw %xmm2, %xmm3
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    psraw $8, %xmm4
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psraw $4, %xmm2
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psraw $2, %xmm2
-; SSE41-NEXT:    paddw %xmm3, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psraw $1, %xmm2
-; SSE41-NEXT:    paddw %xmm3, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psllw $12, %xmm0
+; SSE41-NEXT:    psllw $4, %xmm1
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    paddw %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $8, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $4, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $2, %xmm3
+; SSE41-NEXT:    paddw %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $1, %xmm3
+; SSE41-NEXT:    paddw %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: var_shift_v8i16:
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index aebc0eb0..1ba47a7 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -161,33 +161,31 @@
 ;
 ; SSE41-LABEL: var_shift_v4i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    psllw $12, %xmm0
-; SSE41-NEXT:    psllw $4, %xmm2
-; SSE41-NEXT:    por %xmm0, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    paddw %xmm2, %xmm3
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    psraw $8, %xmm4
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psraw $4, %xmm2
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psraw $2, %xmm2
-; SSE41-NEXT:    paddw %xmm3, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psraw $1, %xmm2
-; SSE41-NEXT:    paddw %xmm3, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psllw $12, %xmm0
+; SSE41-NEXT:    psllw $4, %xmm1
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    paddw %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $8, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $4, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $2, %xmm3
+; SSE41-NEXT:    paddw %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $1, %xmm3
+; SSE41-NEXT:    paddw %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: var_shift_v4i16:
@@ -334,33 +332,31 @@
 ;
 ; SSE41-LABEL: var_shift_v2i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    psllw $12, %xmm0
-; SSE41-NEXT:    psllw $4, %xmm2
-; SSE41-NEXT:    por %xmm0, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    paddw %xmm2, %xmm3
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    psraw $8, %xmm4
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psraw $4, %xmm2
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psraw $2, %xmm2
-; SSE41-NEXT:    paddw %xmm3, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psraw $1, %xmm2
-; SSE41-NEXT:    paddw %xmm3, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psllw $12, %xmm0
+; SSE41-NEXT:    psllw $4, %xmm1
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    paddw %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $8, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $4, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $2, %xmm3
+; SSE41-NEXT:    paddw %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $1, %xmm3
+; SSE41-NEXT:    paddw %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: var_shift_v2i16:
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
index 792cf6a..cb9e134 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -227,33 +227,31 @@
 ;
 ; SSE41-LABEL: var_shift_v8i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    psllw $12, %xmm0
-; SSE41-NEXT:    psllw $4, %xmm2
-; SSE41-NEXT:    por %xmm0, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    paddw %xmm2, %xmm3
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    psrlw $8, %xmm4
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psrlw $4, %xmm2
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psrlw $2, %xmm2
-; SSE41-NEXT:    paddw %xmm3, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psrlw $1, %xmm2
-; SSE41-NEXT:    paddw %xmm3, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psllw $12, %xmm0
+; SSE41-NEXT:    psllw $4, %xmm1
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    paddw %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $8, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $4, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $2, %xmm3
+; SSE41-NEXT:    paddw %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $1, %xmm3
+; SSE41-NEXT:    paddw %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: var_shift_v8i16:
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index ac5f158..4cc16c5 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -161,33 +161,31 @@
 ;
 ; SSE41-LABEL: var_shift_v4i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    psllw $12, %xmm0
-; SSE41-NEXT:    psllw $4, %xmm2
-; SSE41-NEXT:    por %xmm0, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    paddw %xmm2, %xmm3
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    psrlw $8, %xmm4
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psrlw $4, %xmm2
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psrlw $2, %xmm2
-; SSE41-NEXT:    paddw %xmm3, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psrlw $1, %xmm2
-; SSE41-NEXT:    paddw %xmm3, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psllw $12, %xmm0
+; SSE41-NEXT:    psllw $4, %xmm1
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    paddw %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $8, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $4, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $2, %xmm3
+; SSE41-NEXT:    paddw %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $1, %xmm3
+; SSE41-NEXT:    paddw %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: var_shift_v4i16:
@@ -334,33 +332,31 @@
 ;
 ; SSE41-LABEL: var_shift_v2i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    psllw $12, %xmm0
-; SSE41-NEXT:    psllw $4, %xmm2
-; SSE41-NEXT:    por %xmm0, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    paddw %xmm2, %xmm3
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    psrlw $8, %xmm4
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psrlw $4, %xmm2
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psrlw $2, %xmm2
-; SSE41-NEXT:    paddw %xmm3, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psrlw $1, %xmm2
-; SSE41-NEXT:    paddw %xmm3, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psllw $12, %xmm0
+; SSE41-NEXT:    psllw $4, %xmm1
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    paddw %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $8, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $4, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $2, %xmm3
+; SSE41-NEXT:    paddw %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrlw $1, %xmm3
+; SSE41-NEXT:    paddw %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: var_shift_v2i16:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index 442e7c4..b2d813d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -23,18 +23,18 @@
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    vpermi2w %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X86-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
-; X86-NEXT:    vpermi2w %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT:    vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
+; X86-NEXT:    vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_32i16_identity_mask:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X64-NEXT:    kmovd %edi, %k1
-; X64-NEXT:    vpermi2w %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
-; X64-NEXT:    vpermi2w %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X64-NEXT:    vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
+; X64-NEXT:    vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X64-NEXT:    retq
   %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x0, <32 x i16> %x1, i32 %m)
   %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, <32 x i16> %res0, i32 %m)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index c358250..be65eff 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -18,18 +18,18 @@
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    vpermi2w %ymm0, %ymm0, %ymm1 {%k1} {z}
-; X86-NEXT:    vmovdqa {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X86-NEXT:    vpermi2w %ymm1, %ymm1, %ymm0 {%k1} {z}
+; X86-NEXT:    vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z}
+; X86-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X86-NEXT:    vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z}
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16i16_identity_mask:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X64-NEXT:    kmovd %edi, %k1
-; X64-NEXT:    vpermi2w %ymm0, %ymm0, %ymm1 {%k1} {z}
-; X64-NEXT:    vmovdqa {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-NEXT:    vpermi2w %ymm1, %ymm1, %ymm0 {%k1} {z}
+; X64-NEXT:    vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z}
+; X64-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-NEXT:    vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z}
 ; X64-NEXT:    retq
   %res0 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <16 x i16> %x0, <16 x i16> %x1, i16 %m)
   %res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 15, i16 30, i16 13, i16 28, i16 11, i16 26, i16 9, i16 24, i16 7, i16 22, i16 5, i16 20, i16 3, i16 18, i16 1, i16 16>, <16 x i16> %res0, <16 x i16> %res0, i16 %m)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
index c9928d1..77e756b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
@@ -154,9 +154,9 @@
 ; X86-AVX512F-NEXT:    vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
 ; X86-AVX512F-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512F-NEXT:    kmovw %eax, %k1
-; X86-AVX512F-NEXT:    vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X86-AVX512F-NEXT:    vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
-; X86-AVX512F-NEXT:    vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X86-AVX512F-NEXT:    vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X86-AVX512F-NEXT:    vmovapd {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
+; X86-AVX512F-NEXT:    vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X86-AVX512F-NEXT:    retl
 ;
 ; X86-AVX512BW-LABEL: combine_vpermt2var_8f64_identity_mask:
@@ -164,27 +164,27 @@
 ; X86-AVX512BW-NEXT:    vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
 ; X86-AVX512BW-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512BW-NEXT:    kmovd %eax, %k1
-; X86-AVX512BW-NEXT:    vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X86-AVX512BW-NEXT:    vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
-; X86-AVX512BW-NEXT:    vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X86-AVX512BW-NEXT:    vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X86-AVX512BW-NEXT:    vmovapd {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
+; X86-AVX512BW-NEXT:    vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X86-AVX512BW-NEXT:    retl
 ;
 ; X64-AVX512F-LABEL: combine_vpermt2var_8f64_identity_mask:
 ; X64-AVX512F:       # %bb.0:
 ; X64-AVX512F-NEXT:    vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
 ; X64-AVX512F-NEXT:    kmovw %edi, %k1
-; X64-AVX512F-NEXT:    vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X64-AVX512F-NEXT:    vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
-; X64-AVX512F-NEXT:    vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT:    vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT:    vmovapd {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
+; X64-AVX512F-NEXT:    vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X64-AVX512F-NEXT:    retq
 ;
 ; X64-AVX512BW-LABEL: combine_vpermt2var_8f64_identity_mask:
 ; X64-AVX512BW:       # %bb.0:
 ; X64-AVX512BW-NEXT:    vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
 ; X64-AVX512BW-NEXT:    kmovd %edi, %k1
-; X64-AVX512BW-NEXT:    vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X64-AVX512BW-NEXT:    vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
-; X64-AVX512BW-NEXT:    vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512BW-NEXT:    vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512BW-NEXT:    vmovapd {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
+; X64-AVX512BW-NEXT:    vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X64-AVX512BW-NEXT:    retq
   %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 %m)
   %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 %m)
@@ -258,9 +258,9 @@
 ; X86-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
 ; X86-AVX512F-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512F-NEXT:    kmovw %eax, %k1
-; X86-AVX512F-NEXT:    vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X86-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
-; X86-AVX512F-NEXT:    vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X86-AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X86-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
+; X86-AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X86-AVX512F-NEXT:    retl
 ;
 ; X86-AVX512BW-LABEL: combine_vpermt2var_8i64_identity_mask:
@@ -268,27 +268,27 @@
 ; X86-AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
 ; X86-AVX512BW-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512BW-NEXT:    kmovd %eax, %k1
-; X86-AVX512BW-NEXT:    vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X86-AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
-; X86-AVX512BW-NEXT:    vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X86-AVX512BW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X86-AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
+; X86-AVX512BW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X86-AVX512BW-NEXT:    retl
 ;
 ; X64-AVX512F-LABEL: combine_vpermt2var_8i64_identity_mask:
 ; X64-AVX512F:       # %bb.0:
 ; X64-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
 ; X64-AVX512F-NEXT:    kmovw %edi, %k1
-; X64-AVX512F-NEXT:    vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X64-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
-; X64-AVX512F-NEXT:    vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
+; X64-AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X64-AVX512F-NEXT:    retq
 ;
 ; X64-AVX512BW-LABEL: combine_vpermt2var_8i64_identity_mask:
 ; X64-AVX512BW:       # %bb.0:
 ; X64-AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
 ; X64-AVX512BW-NEXT:    kmovd %edi, %k1
-; X64-AVX512BW-NEXT:    vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X64-AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
-; X64-AVX512BW-NEXT:    vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512BW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
+; X64-AVX512BW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X64-AVX512BW-NEXT:    retq
   %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 %m)
   %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 %m)
@@ -308,27 +308,27 @@
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X86-NEXT:    vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X86-NEXT:    vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT:    vmovaps {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X86-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X86-NEXT:    retl
 ;
 ; X64-AVX512F-LABEL: combine_vpermt2var_16f32_identity_mask:
 ; X64-AVX512F:       # %bb.0:
 ; X64-AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X64-AVX512F-NEXT:    kmovw %edi, %k1
-; X64-AVX512F-NEXT:    vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X64-AVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-AVX512F-NEXT:    vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-AVX512F-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X64-AVX512F-NEXT:    retq
 ;
 ; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_identity_mask:
 ; X64-AVX512BW:       # %bb.0:
 ; X64-AVX512BW-NEXT:    vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X64-AVX512BW-NEXT:    kmovd %edi, %k1
-; X64-AVX512BW-NEXT:    vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X64-AVX512BW-NEXT:    vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-AVX512BW-NEXT:    vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512BW-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512BW-NEXT:    vmovaps {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-AVX512BW-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X64-AVX512BW-NEXT:    retq
   %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 %m)
   %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 %m)
@@ -597,27 +597,27 @@
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X86-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X86-NEXT:    vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT:    vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X86-NEXT:    vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X86-NEXT:    retl
 ;
 ; X64-AVX512F-LABEL: combine_vpermt2var_16i32_identity_mask:
 ; X64-AVX512F:       # %bb.0:
 ; X64-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X64-AVX512F-NEXT:    kmovw %edi, %k1
-; X64-AVX512F-NEXT:    vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X64-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-AVX512F-NEXT:    vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT:    vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-AVX512F-NEXT:    vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X64-AVX512F-NEXT:    retq
 ;
 ; X64-AVX512BW-LABEL: combine_vpermt2var_16i32_identity_mask:
 ; X64-AVX512BW:       # %bb.0:
 ; X64-AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X64-AVX512BW-NEXT:    kmovd %edi, %k1
-; X64-AVX512BW-NEXT:    vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X64-AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-AVX512BW-NEXT:    vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512BW-NEXT:    vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-AVX512BW-NEXT:    vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X64-AVX512BW-NEXT:    retq
   %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> %x0, <16 x i32> %x1, i16 %m)
   %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 %m)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
index 86c206c..78ae0d2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
@@ -31,18 +31,18 @@
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    vpermi2b %xmm0, %xmm0, %xmm1 {%k1} {z}
-; X86-NEXT:    vmovdqa {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X86-NEXT:    vpermi2b %xmm1, %xmm1, %xmm0 {%k1} {z}
+; X86-NEXT:    vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z}
+; X86-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X86-NEXT:    vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z}
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16i8_identity_mask:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X64-NEXT:    kmovd %edi, %k1
-; X64-NEXT:    vpermi2b %xmm0, %xmm0, %xmm1 {%k1} {z}
-; X64-NEXT:    vmovdqa {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-NEXT:    vpermi2b %xmm1, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT:    vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-NEXT:    vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z}
 ; X64-NEXT:    retq
   %res0 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> %x0, <16 x i8> %x1, i16 %m)
   %res1 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 15, i8 30, i8 13, i8 28, i8 11, i8 26, i8 9, i8 24, i8 7, i8 22, i8 5, i8 20, i8 3, i8 18, i8 1, i8 16>, <16 x i8> %res0, <16 x i8> %res0, i16 %m)
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
index a4dc0b1..9ab03ce 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -321,110 +321,110 @@
 define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
 ; SSE2-LABEL: trunc_packus_v4i64_v4i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm4
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147483647,2147483647]
 ; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pand %xmm7, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    pandn %xmm8, %xmm3
-; SSE2-NEXT:    por %xmm0, %xmm3
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm5, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm4
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pandn %xmm8, %xmm4
-; SSE2-NEXT:    por %xmm1, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pandn %xmm3, %xmm6
+; SSE2-NEXT:    por %xmm6, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm5
+; SSE2-NEXT:    por %xmm1, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pand %xmm5, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
 ; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm0
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: trunc_packus_v4i64_v4i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pxor %xmm2, %xmm4
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147483647,2147483647]
 ; SSSE3-NEXT:    movdqa %xmm5, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm7, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    por %xmm4, %xmm3
-; SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSSE3-NEXT:    pandn %xmm8, %xmm3
-; SSSE3-NEXT:    por %xmm0, %xmm3
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa %xmm5, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm0, %xmm4
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    pandn %xmm8, %xmm4
-; SSSE3-NEXT:    por %xmm1, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pand %xmm7, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm6
+; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pandn %xmm3, %xmm6
+; SSSE3-NEXT:    por %xmm6, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pxor %xmm2, %xmm4
+; SSSE3-NEXT:    movdqa %xmm5, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm1
+; SSSE3-NEXT:    pandn %xmm3, %xmm5
+; SSSE3-NEXT:    por %xmm1, %xmm5
+; SSSE3-NEXT:    movdqa %xmm5, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    por %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm3
+; SSSE3-NEXT:    pand %xmm5, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
 ; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm4, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
-; SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_packus_v4i64_v4i32:
@@ -2312,10 +2312,10 @@
 ; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSSE3-NEXT:    pand %xmm2, %xmm0
 ; SSSE3-NEXT:    pandn %xmm1, %xmm2
-; SSSE3-NEXT:    por %xmm0, %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSSE3-NEXT:    pand %xmm2, %xmm0
 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; SSSE3-NEXT:    retq
@@ -2448,19 +2448,19 @@
 ; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    pandn %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    pslld $16, %xmm1
-; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pslld $16, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
 ; SSE2-NEXT:    pslld $16, %xmm0
 ; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    packssdw %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: trunc_packus_v8i32_v8i16:
@@ -2475,18 +2475,18 @@
 ; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
 ; SSSE3-NEXT:    pand %xmm1, %xmm0
 ; SSSE3-NEXT:    pandn %xmm2, %xmm1
-; SSSE3-NEXT:    por %xmm0, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT:    pand %xmm1, %xmm0
-; SSSE3-NEXT:    movdqa %xmm3, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT:    pand %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT:    pshufb %xmm2, %xmm1
-; SSSE3-NEXT:    pshufb %xmm2, %xmm0
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT:    pshufb %xmm1, %xmm2
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_packus_v8i32_v8i16:
@@ -2725,18 +2725,18 @@
 ; SSE2-NEXT:    por %xmm2, %xmm3
 ; SSE2-NEXT:    pand %xmm3, %xmm0
 ; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT:    por %xmm0, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
@@ -2759,18 +2759,18 @@
 ; SSSE3-NEXT:    por %xmm2, %xmm3
 ; SSSE3-NEXT:    pand %xmm3, %xmm0
 ; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    por %xmm0, %xmm3
-; SSSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSSE3-NEXT:    pxor %xmm1, %xmm0
+; SSSE3-NEXT:    por %xmm3, %xmm0
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    pxor %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    por %xmm1, %xmm0
-; SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm0
 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSSE3-NEXT:    retq
 ;
@@ -3043,37 +3043,37 @@
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm5, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm5, %xmm1
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    pandn %xmm8, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm8, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm5, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm6, %xmm0
-; SSE2-NEXT:    movdqa %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm3, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
 ; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
 ; SSE2-NEXT:    pand %xmm8, %xmm3
 ; SSE2-NEXT:    pand %xmm4, %xmm3
-; SSE2-NEXT:    pand %xmm8, %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm8, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    packuswb %xmm3, %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
@@ -3109,17 +3109,17 @@
 ; SSSE3-NEXT:    por %xmm1, %xmm4
 ; SSSE3-NEXT:    pand %xmm4, %xmm0
 ; SSSE3-NEXT:    pandn %xmm8, %xmm4
-; SSSE3-NEXT:    por %xmm0, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    por %xmm4, %xmm0
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm5, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    por %xmm6, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm5, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm4
 ; SSSE3-NEXT:    pand %xmm4, %xmm0
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm1
 ; SSSE3-NEXT:    pxor %xmm2, %xmm1
@@ -5097,10 +5097,10 @@
 ; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    pandn %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
@@ -5114,10 +5114,10 @@
 ; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSSE3-NEXT:    pand %xmm2, %xmm0
 ; SSSE3-NEXT:    pandn %xmm1, %xmm2
-; SSSE3-NEXT:    por %xmm0, %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSSE3-NEXT:    pand %xmm2, %xmm0
 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSSE3-NEXT:    retq
@@ -5126,10 +5126,9 @@
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pmaxsd %xmm0, %xmm1
-; SSE41-NEXT:    packusdw %xmm1, %xmm1
-; SSE41-NEXT:    packuswb %xmm1, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pmaxsd %xmm1, %xmm0
+; SSE41-NEXT:    packusdw %xmm0, %xmm0
+; SSE41-NEXT:    packuswb %xmm0, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_packus_v4i32_v4i8:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
index e8a0ac2..2a727bc 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -313,122 +313,122 @@
 define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
 ; SSE2-LABEL: trunc_ssat_v4i64_v4i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm4
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [4294967295,4294967295]
 ; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pand %xmm7, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    pandn %xmm8, %xmm3
-; SSE2-NEXT:    por %xmm0, %xmm3
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm5, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm4
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pandn %xmm8, %xmm4
-; SSE2-NEXT:    por %xmm1, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; SSE2-NEXT:    movdqa %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [18446744069414584320,18446744069414584320]
-; SSE2-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    pand %xmm7, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm6
-; SSE2-NEXT:    pand %xmm6, %xmm4
-; SSE2-NEXT:    pandn %xmm1, %xmm6
 ; SSE2-NEXT:    por %xmm4, %xmm6
-; SSE2-NEXT:    pxor %xmm3, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pandn %xmm3, %xmm6
+; SSE2-NEXT:    por %xmm6, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm5
+; SSE2-NEXT:    por %xmm1, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; SSE2-NEXT:    movdqa %xmm5, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [18446744069414584320,18446744069414584320]
+; SSE2-NEXT:    movdqa %xmm3, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm5
+; SSE2-NEXT:    pandn %xmm1, %xmm6
+; SSE2-NEXT:    por %xmm5, %xmm6
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm0
-; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm3
 ; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: trunc_ssat_v4i64_v4i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pxor %xmm2, %xmm4
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [4294967295,4294967295]
 ; SSSE3-NEXT:    movdqa %xmm5, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm7, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    por %xmm4, %xmm3
-; SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSSE3-NEXT:    pandn %xmm8, %xmm3
-; SSSE3-NEXT:    por %xmm0, %xmm3
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa %xmm5, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm0, %xmm4
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    pandn %xmm8, %xmm4
-; SSSE3-NEXT:    por %xmm1, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; SSSE3-NEXT:    movdqa %xmm4, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [18446744069414584320,18446744069414584320]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm7, %xmm0
+; SSSE3-NEXT:    pand %xmm7, %xmm4
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    por %xmm0, %xmm6
-; SSSE3-NEXT:    pand %xmm6, %xmm4
-; SSSE3-NEXT:    pandn %xmm1, %xmm6
 ; SSSE3-NEXT:    por %xmm4, %xmm6
-; SSSE3-NEXT:    pxor %xmm3, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pandn %xmm3, %xmm6
+; SSSE3-NEXT:    por %xmm6, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pxor %xmm2, %xmm4
+; SSSE3-NEXT:    movdqa %xmm5, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm1
+; SSSE3-NEXT:    pandn %xmm3, %xmm5
+; SSSE3-NEXT:    por %xmm1, %xmm5
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; SSSE3-NEXT:    movdqa %xmm5, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [18446744069414584320,18446744069414584320]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm3, %xmm6
+; SSSE3-NEXT:    pand %xmm6, %xmm5
+; SSSE3-NEXT:    pandn %xmm1, %xmm6
+; SSSE3-NEXT:    por %xmm5, %xmm6
+; SSSE3-NEXT:    pxor %xmm0, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm2
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm4, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
-; SSSE3-NEXT:    pand %xmm0, %xmm3
-; SSSE3-NEXT:    pandn %xmm1, %xmm0
+; SSSE3-NEXT:    pand %xmm5, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSSE3-NEXT:    pandn %xmm1, %xmm3
 ; SSSE3-NEXT:    por %xmm3, %xmm0
 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
 ; SSSE3-NEXT:    retq
@@ -1376,61 +1376,61 @@
 define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) {
 ; SSE2-LABEL: trunc_ssat_v4i64_v4i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [32767,32767]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [32767,32767]
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm4
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147516415,2147516415]
 ; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pand %xmm7, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    pandn %xmm8, %xmm3
-; SSE2-NEXT:    por %xmm0, %xmm3
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm5, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm4
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pandn %xmm8, %xmm4
-; SSE2-NEXT:    por %xmm1, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
-; SSE2-NEXT:    movdqa %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [18446744071562035200,18446744071562035200]
-; SSE2-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    pand %xmm7, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm6
-; SSE2-NEXT:    pand %xmm6, %xmm4
-; SSE2-NEXT:    pandn %xmm1, %xmm6
 ; SSE2-NEXT:    por %xmm4, %xmm6
-; SSE2-NEXT:    pxor %xmm3, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pandn %xmm3, %xmm6
+; SSE2-NEXT:    por %xmm6, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm5
+; SSE2-NEXT:    por %xmm1, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; SSE2-NEXT:    movdqa %xmm5, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [18446744071562035200,18446744071562035200]
+; SSE2-NEXT:    movdqa %xmm3, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm5
+; SSE2-NEXT:    pandn %xmm1, %xmm6
+; SSE2-NEXT:    por %xmm5, %xmm6
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm0
-; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm3
 ; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    packssdw %xmm6, %xmm0
 ; SSE2-NEXT:    packssdw %xmm0, %xmm0
@@ -1438,61 +1438,61 @@
 ;
 ; SSSE3-LABEL: trunc_ssat_v4i64_v4i16:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [32767,32767]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [32767,32767]
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pxor %xmm2, %xmm4
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147516415,2147516415]
 ; SSSE3-NEXT:    movdqa %xmm5, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm3
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm7, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    por %xmm4, %xmm3
-; SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSSE3-NEXT:    pandn %xmm8, %xmm3
-; SSSE3-NEXT:    por %xmm0, %xmm3
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa %xmm5, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT:    por %xmm0, %xmm4
-; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    pandn %xmm8, %xmm4
-; SSSE3-NEXT:    por %xmm1, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
-; SSSE3-NEXT:    movdqa %xmm4, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [18446744071562035200,18446744071562035200]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm7, %xmm0
+; SSSE3-NEXT:    pand %xmm7, %xmm4
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    por %xmm0, %xmm6
-; SSSE3-NEXT:    pand %xmm6, %xmm4
-; SSSE3-NEXT:    pandn %xmm1, %xmm6
 ; SSSE3-NEXT:    por %xmm4, %xmm6
-; SSSE3-NEXT:    pxor %xmm3, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pandn %xmm3, %xmm6
+; SSSE3-NEXT:    por %xmm6, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pxor %xmm2, %xmm4
+; SSSE3-NEXT:    movdqa %xmm5, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm1
+; SSSE3-NEXT:    pandn %xmm3, %xmm5
+; SSSE3-NEXT:    por %xmm1, %xmm5
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; SSSE3-NEXT:    movdqa %xmm5, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [18446744071562035200,18446744071562035200]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm3, %xmm6
+; SSSE3-NEXT:    pand %xmm6, %xmm5
+; SSSE3-NEXT:    pandn %xmm1, %xmm6
+; SSSE3-NEXT:    por %xmm5, %xmm6
+; SSSE3-NEXT:    pxor %xmm0, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm2
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm4, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm0
-; SSSE3-NEXT:    pand %xmm0, %xmm3
-; SSSE3-NEXT:    pandn %xmm1, %xmm0
+; SSSE3-NEXT:    pand %xmm5, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm3
+; SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSSE3-NEXT:    pandn %xmm1, %xmm3
 ; SSSE3-NEXT:    por %xmm3, %xmm0
 ; SSSE3-NEXT:    packssdw %xmm6, %xmm0
 ; SSSE3-NEXT:    packssdw %xmm0, %xmm0
@@ -2485,20 +2485,20 @@
 ; SSE2-NEXT:    por %xmm2, %xmm3
 ; SSE2-NEXT:    pand %xmm3, %xmm0
 ; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT:    por %xmm0, %xmm3
-; SSE2-NEXT:    pxor %xmm3, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [18446744071562067840,18446744071562067840]
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [18446744071562067840,18446744071562067840]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
@@ -2521,20 +2521,20 @@
 ; SSSE3-NEXT:    por %xmm2, %xmm3
 ; SSSE3-NEXT:    pand %xmm3, %xmm0
 ; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT:    por %xmm0, %xmm3
-; SSSE3-NEXT:    pxor %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [18446744071562067840,18446744071562067840]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSSE3-NEXT:    por %xmm3, %xmm0
+; SSSE3-NEXT:    pxor %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [18446744071562067840,18446744071562067840]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSSE3-NEXT:    pand %xmm4, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    por %xmm1, %xmm0
-; SSSE3-NEXT:    pand %xmm0, %xmm3
-; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSSE3-NEXT:    por %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSSE3-NEXT:    retq
 ;
@@ -2797,38 +2797,38 @@
 ; SSE2-NEXT:    por %xmm1, %xmm4
 ; SSE2-NEXT:    pand %xmm4, %xmm0
 ; SSE2-NEXT:    pandn %xmm8, %xmm4
-; SSE2-NEXT:    por %xmm0, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
-; SSE2-NEXT:    movdqa %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm4
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [18446744071562067840,18446744071562067840]
-; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    movdqa %xmm4, %xmm6
 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm7, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    pand %xmm0, %xmm4
-; SSE2-NEXT:    pandn %xmm8, %xmm0
-; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm6
+; SSE2-NEXT:    por %xmm6, %xmm0
 ; SSE2-NEXT:    pxor %xmm3, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm1, %xmm3
-; SSE2-NEXT:    pandn %xmm8, %xmm1
-; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm4
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0]
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm4, %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
 ; SSE2-NEXT:    retq
@@ -2863,38 +2863,38 @@
 ; SSSE3-NEXT:    por %xmm1, %xmm4
 ; SSSE3-NEXT:    pand %xmm4, %xmm0
 ; SSSE3-NEXT:    pandn %xmm8, %xmm4
-; SSSE3-NEXT:    por %xmm0, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
-; SSSE3-NEXT:    movdqa %xmm4, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    por %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pxor %xmm2, %xmm4
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [18446744071562067840,18446744071562067840]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm6
+; SSSE3-NEXT:    movdqa %xmm4, %xmm6
 ; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm7, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSSE3-NEXT:    por %xmm1, %xmm0
-; SSSE3-NEXT:    pand %xmm0, %xmm4
-; SSSE3-NEXT:    pandn %xmm8, %xmm0
-; SSSE3-NEXT:    por %xmm4, %xmm0
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    pand %xmm7, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT:    por %xmm4, %xmm6
+; SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSSE3-NEXT:    pandn %xmm1, %xmm6
+; SSSE3-NEXT:    por %xmm6, %xmm0
 ; SSSE3-NEXT:    pxor %xmm3, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    pand %xmm4, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    por %xmm2, %xmm1
-; SSSE3-NEXT:    pand %xmm1, %xmm3
-; SSSE3-NEXT:    pandn %xmm8, %xmm1
-; SSSE3-NEXT:    por %xmm3, %xmm1
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT:    pshufb %xmm2, %xmm1
-; SSSE3-NEXT:    pshufb %xmm2, %xmm0
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    pand %xmm6, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT:    por %xmm2, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm3
+; SSSE3-NEXT:    pandn %xmm1, %xmm4
+; SSSE3-NEXT:    por %xmm3, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT:    pshufb %xmm1, %xmm4
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_ssat_v4i64_v4i8:
@@ -4913,12 +4913,12 @@
 ; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    pandn %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm2
 ; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
@@ -4932,12 +4932,12 @@
 ; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSSE3-NEXT:    pand %xmm2, %xmm0
 ; SSSE3-NEXT:    pandn %xmm1, %xmm2
-; SSSE3-NEXT:    por %xmm0, %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT:    pand %xmm0, %xmm2
-; SSSE3-NEXT:    pandn %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pandn %xmm1, %xmm2
 ; SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSSE3-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index b4784ec..f61977c 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -2048,18 +2048,17 @@
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    packuswb %xmm1, %xmm1
-; SSE2-NEXT:    packuswb %xmm1, %xmm1
-; SSE2-NEXT:    packuswb %xmm1, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: trunc_usat_v2i64_v2i8:
@@ -2258,37 +2257,37 @@
 define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) {
 ; SSE2-LABEL: trunc_usat_v4i64_v4i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm3, %xmm4
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711]
 ; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pand %xmm7, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm0
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    pandn %xmm8, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm4
-; SSE2-NEXT:    movdqa %xmm5, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT:    pand %xmm3, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pandn %xmm8, %xmm2
-; SSE2-NEXT:    por %xmm1, %xmm2
-; SSE2-NEXT:    pand %xmm8, %xmm2
-; SSE2-NEXT:    pand %xmm8, %xmm0
-; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm4, %xmm6
+; SSE2-NEXT:    pand %xmm6, %xmm0
+; SSE2-NEXT:    pandn %xmm2, %xmm6
+; SSE2-NEXT:    por %xmm6, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pandn %xmm2, %xmm4
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm2, %xmm4
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm4, %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
 ; SSE2-NEXT:    retq
@@ -3607,17 +3606,16 @@
 define <4 x i8> @trunc_usat_v4i32_v4i8(<4 x i32> %a0) {
 ; SSE2-LABEL: trunc_usat_v4i32_v4i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    packuswb %xmm1, %xmm1
-; SSE2-NEXT:    packuswb %xmm1, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: trunc_usat_v4i32_v4i8:
diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
index 4d37caf..7a7a0ac 100644
--- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -435,16 +435,15 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: testv4i32:
@@ -464,16 +463,15 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: testv4i32:
@@ -678,16 +676,15 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    psadbw %xmm0, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: testv4i32u:
@@ -707,16 +704,15 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT:    psadbw %xmm0, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSE3-NEXT:    packuswb %xmm2, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    psadbw %xmm1, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    packuswb %xmm2, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: testv4i32u:
@@ -921,10 +917,10 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    retq
@@ -946,10 +942,10 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psllw $8, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psllw $8, %xmm1
 ; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $8, %xmm0
 ; SSE3-NEXT:    retq
@@ -1097,10 +1093,10 @@
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
 ; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
 ; SSE2-NEXT:    retq
@@ -1122,10 +1118,10 @@
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddb %xmm0, %xmm1
-; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psllw $8, %xmm0
+; SSE3-NEXT:    paddb %xmm1, %xmm0
+; SSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psllw $8, %xmm1
 ; SSE3-NEXT:    paddb %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $8, %xmm0
 ; SSE3-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vselect-packss.ll b/llvm/test/CodeGen/X86/vselect-packss.ll
index fa78263..5e88143 100644
--- a/llvm/test/CodeGen/X86/vselect-packss.ll
+++ b/llvm/test/CodeGen/X86/vselect-packss.ll
@@ -225,15 +225,14 @@
 ; SSE2-NEXT:    pand %xmm1, %xmm2
 ; SSE2-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    packssdw %xmm2, %xmm1
-; SSE2-NEXT:    packssdw %xmm3, %xmm1
-; SSE2-NEXT:    packsswb %xmm5, %xmm1
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
 ; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    pandn {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    packssdw %xmm2, %xmm0
+; SSE2-NEXT:    packssdw %xmm3, %xmm0
+; SSE2-NEXT:    packsswb %xmm5, %xmm0
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pandn {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: vselect_packss_v16i64: