[X86] Call SimplifyDemandedBits on conditions of X86ISD::SHRUNKBLEND
This extends to combineVSelectToShrunkBlend to be able to resimplify SHRUNKBLENDS that have already been created.
This should help some of the regressions from D56387
Differential Revision: https://reviews.llvm.org/D56421
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@350875 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 742bf74..def4be5 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -33947,11 +33947,14 @@
/// If this is a *dynamic* select (non-constant condition) and we can match
/// this node with one of the variable blend instructions, restructure the
/// condition so that blends can use the high (sign) bit of each element.
+/// This function will also call SimplfiyDemandedBits on already created
+/// SHRUNKBLENDS to perform additional simplifications.
static SDValue combineVSelectToShrunkBlend(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Cond = N->getOperand(0);
- if (N->getOpcode() != ISD::VSELECT ||
+ if ((N->getOpcode() != ISD::VSELECT &&
+ N->getOpcode() != X86ISD::SHRUNKBLEND) ||
ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return SDValue();
@@ -33993,7 +33996,9 @@
// TODO: Add other opcodes eventually lowered into BLEND.
for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
UI != UE; ++UI)
- if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0)
+ if ((UI->getOpcode() != ISD::VSELECT &&
+ UI->getOpcode() != X86ISD::SHRUNKBLEND) ||
+ UI.getOperandNo() != 0)
return SDValue();
APInt DemandedMask(APInt::getSignMask(BitWidth));
@@ -34009,6 +34014,9 @@
// optimizations as we messed with the actual expectation for the vector
// boolean values.
for (SDNode *U : Cond->uses()) {
+ if (U->getOpcode() == X86ISD::SHRUNKBLEND)
+ continue;
+
SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U), U->getValueType(0),
Cond, U->getOperand(1), U->getOperand(2));
DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
diff --git a/test/CodeGen/X86/avx2-logic.ll b/test/CodeGen/X86/avx2-logic.ll
index 8f2207f..f7b62ea 100644
--- a/test/CodeGen/X86/avx2-logic.ll
+++ b/test/CodeGen/X86/avx2-logic.ll
@@ -91,14 +91,12 @@
; X32-LABEL: vpblendvb:
; X32: # %bb.0:
; X32-NEXT: vpsllw $7, %ymm0, %ymm0
-; X32-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: vpblendvb:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %ymm0, %ymm0
-; X64-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
; X64-NEXT: retq
%min = select <32 x i1> %cond, <32 x i8> %x, <32 x i8> %y
diff --git a/test/CodeGen/X86/vector-reduce-smax.ll b/test/CodeGen/X86/vector-reduce-smax.ll
index 19ac789..162af26 100644
--- a/test/CodeGen/X86/vector-reduce-smax.ll
+++ b/test/CodeGen/X86/vector-reduce-smax.ll
@@ -709,17 +709,16 @@
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
; SSE41-NEXT: psrad $31, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movd %xmm3, %eax
; SSE41-NEXT: retq
@@ -1171,12 +1170,11 @@
; SSE41-NEXT: pxor %xmm0, %xmm2
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
@@ -1658,12 +1656,11 @@
; SSE41-NEXT: pxor %xmm0, %xmm2
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
diff --git a/test/CodeGen/X86/vector-reduce-smin.ll b/test/CodeGen/X86/vector-reduce-smin.ll
index 0b09c94..b27812e 100644
--- a/test/CodeGen/X86/vector-reduce-smin.ll
+++ b/test/CodeGen/X86/vector-reduce-smin.ll
@@ -708,17 +708,16 @@
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
; SSE41-NEXT: psrad $31, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movd %xmm3, %eax
; SSE41-NEXT: retq
@@ -1165,17 +1164,16 @@
; SSE41-NEXT: psrad $16, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
@@ -1652,17 +1650,16 @@
; SSE41-NEXT: psrad $24, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
diff --git a/test/CodeGen/X86/vector-reduce-umax.ll b/test/CodeGen/X86/vector-reduce-umax.ll
index 43cf9a9..32ea3f5 100644
--- a/test/CodeGen/X86/vector-reduce-umax.ll
+++ b/test/CodeGen/X86/vector-reduce-umax.ll
@@ -760,12 +760,11 @@
; SSE41-NEXT: pxor %xmm0, %xmm3
; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movd %xmm2, %eax
; SSE41-NEXT: retq
@@ -1248,12 +1247,11 @@
; SSE41-NEXT: por %xmm0, %xmm3
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movd %xmm2, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
@@ -1803,12 +1801,11 @@
; SSE41-NEXT: por %xmm3, %xmm4
; SSE41-NEXT: por %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: pextrb $0, %xmm2, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
diff --git a/test/CodeGen/X86/vector-reduce-umin.ll b/test/CodeGen/X86/vector-reduce-umin.ll
index ece9684..e3adb9f 100644
--- a/test/CodeGen/X86/vector-reduce-umin.ll
+++ b/test/CodeGen/X86/vector-reduce-umin.ll
@@ -754,17 +754,16 @@
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movd %xmm2, %eax
; SSE41-NEXT: retq
@@ -1242,17 +1241,16 @@
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: por %xmm0, %xmm3
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movd %xmm2, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
@@ -1704,17 +1702,16 @@
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: por %xmm0, %xmm3
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: pextrb $0, %xmm2, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax